diff --git a/CHANGELOG.md b/CHANGELOG.md index aa367f2377..1befb3b2e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -179,3 +179,67 @@ * `metrics/rmse` should be removed because RMSE metrics don't really make sense here. * `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`. + + +## Multi modality - Joint Embedding + +### New functionality + +* `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the mask, method and metric components. + +* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects. + +* `resources_test/joint_embedding/pancreas` with `src/joint_embedding/resources_test_scripts/pancreas.sh`. + +### neurips 2021 migration + +* `control_methods/random_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_random`. + +* `control_methods/zeros_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_zeros`. + +* `methods/lmds`: Migrated from neurips 2021. + +* `methods/mnn`: Migrated and adapted from neurips 2021. + +* `methods/newwave`: Migrated and adapted from neurips 2021. + +* `methods/pca`: Migrated from neurips 2021. + +* `methods/totalvi`: Migrated from neurips 2021. + +* `methods/umap`: Migrated from neurips 2021. + +* `metrics/ari`: Migrated from neurips 2021. + +* `metrics/asw_batch`: Migrated from neurips 2021. + +* `metrics/asw_label`: Migrated from neurips 2021. + +* `metrics/cc_cons`: Migrated from neurips 2021. + +* `metrics/check_format`: Migrated from neurips 2021. + +* `metrics/graph_connectivity`: Migrated from neurips 2021. + +* `metrics/latent_mixing`: Migrated from neurips 2021. + +* `metrics/nmi`: Migrated from neurips 2021. + +* `metrics/rfoob`: Migrated from neurips 2021. + +* `metrics/ti_cons`: Migrated from neurips 2021. + +* `metrics/ti_cons_batch`: Migrated from neurips 2021. 
+ +### changes from neurips 2021 + +* Updated docker config from R script. Was using an old `anndata` package which was giving warnings + +* stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata` + +* `X_emb` data is stored as a `Sparse Matrix` + + +* updated configs to latest `viash` \ No newline at end of file diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md new file mode 100644 index 0000000000..7d9f65431c --- /dev/null +++ b/src/joint_embedding/README.md @@ -0,0 +1,23 @@ +# Joint embedding + +Structure of this task: + + src/embedding + ├── api Interface specifications for components and datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml new file mode 100644 index 0000000000..23c6b427be --- /dev/null +++ b/src/joint_embedding/api/anndata_dataset.yaml @@ -0,0 +1,75 @@ +type: file +description: "A raw dataset" +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + 
description: The size factors created by the normalisation method, if any. + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: false + - type: string + name: sample_pm_varnames + description: + required: false + diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml new file mode 100644 index 0000000000..7ca6820671 --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -0,0 +1,37 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + 
required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml new file mode 100644 index 0000000000..ad735fffba --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod2.yaml @@ -0,0 +1,39 @@ +type: file +description: "The masked data for mod2 file" +example: "masked.h5ad" +info: + short_description: "Masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + obsm: + - type: double + name: gene_activity + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml new file mode 100644 index 0000000000..49d8ae7d79 --- /dev/null +++ b/src/joint_embedding/api/anndata_prediction.yaml @@ -0,0 +1,25 @@ +type: file +description: "The prediction file" +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + obsm: + - type: double + name: X_emb + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ 
b/src/joint_embedding/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml new file mode 100644 index 0000000000..2ed03e1cc3 --- /dev/null +++ b/src/joint_embedding/api/anndata_solution.yaml @@ -0,0 +1,57 @@ +type: file +description: "The solution for the data" +example: "solution.h5ad" +info: + short_description: "Solution" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which 
organism + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml new file mode 100644 index 0000000000..f2e96c35fd --- /dev/null +++ b/src/joint_embedding/api/authors.yaml @@ -0,0 +1,24 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Christopher Lance + email: clance.connect@gmail.com + roles: [ author, maintainer ] + props: { github: xlancelottx } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Ann Chen + email: ann.chen@czbiohub.org + roles: [ author, maintainer ] + props: { github: atchen} \ No newline at end of file diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml new file mode 100644 index 0000000000..d1ec13ed17 --- /dev/null +++ b/src/joint_embedding/api/comp_control_method.yaml @@ -0,0 +1,46 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files 
were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..0b97e89fca --- /dev/null +++ b/src/joint_embedding/api/comp_mask_dataset.yaml @@ -0,0 +1,15 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_mod1" + __merge__: anndata_masked_mod1.yaml + direction: output + - name: "--output_mod2" + __merge__: anndata_masked_mod2.yaml + direction: output + - name: "--output_solution" + __merge__: anndata_solution.yaml + direction: output \ No newline at end of file diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml new file mode 100644 index 0000000000..415f42ea3d --- /dev/null +++ b/src/joint_embedding/api/comp_method.yaml @@ -0,0 +1,50 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--input_mod2" + __merge__: anndata_masked_mod2.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = 
"resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--input_mod2", input_mod2_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") \ No newline at end of file diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml new file mode 100644 index 0000000000..76e3b17430 --- /dev/null +++ b/src/joint_embedding/api/comp_metric.yaml @@ -0,0 +1,87 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + - name: --debug + type: boolean_true + description: Verbose output for debugging. 
+ test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad" + input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad" + output_path = "output.h5ad" + + # load config yaml + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert 
set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/joint_embedding/api/task_info.yaml b/src/joint_embedding/api/task_info.yaml new file mode 100644 index 0000000000..79267651ce --- /dev/null +++ b/src/joint_embedding/api/task_info.yaml @@ -0,0 +1,14 @@ +task_id: joint_embedding +task_name: Joint Embedding +summary: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation). +description: | + The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. + Cells are characterised into broad types, which in turn can take on different states. Here, a cell + state is made up of the sum of all processes that are occurring within the cell. We can gain insight + into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance, + or chromatin conformation. Combining this information to describe cellular heterogeneity requires the + formation of joint embeddings generated from this multimodal data. These embeddings must account for + and remove possible batch effects between different measurement batches. 
The reward for methods that + can achieve this is great: a highly resolved description of the underlying biological state of a cell + that determines its function, how it interacts with other cells, and thus the cell’s role in the f + unctioning of the whole tissue. \ No newline at end of file diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml new file mode 100644 index 0000000000..15b0c745e9 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_embed + namespace: joint_embedding/control_methods + description: Generate a random embedding from a normal distribution. + info: + type: negative_control + method_name: Normal Dist. + arguments: + - name: "--n_dims" + type: "integer" + default: 100 + description: Number of dimensions to output. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy , scipy] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py new file mode 100644 index 0000000000..199ad08176 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/script.py @@ -0,0 +1,26 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "output/output_prediction.h5ad", + "n_dims": 100, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.random.randn(adata_mod1.shape[0], par["n_dims"]) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obsm= {"X_emb": 
sparse.csr_matrix(X)}, + obs=adata_mod1.obs[["batch"]], + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"}, +) +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml new file mode 100644 index 0000000000..2cd0a9b71f --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: zeros_embed + namespace: joint_embedding/control_methods + description: Generate an embedding containing only zero values. + info: + type: negative_control + method_name: zeros_embed + arguments: + - name: "--n_dims" + type: "integer" + default: 1 + description: Number of dimensions to output. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, numpy, scipy ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py new file mode 100644 index 0000000000..f54ef3ce18 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -0,0 +1,27 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "tmp/output_prediction.h5ad", + "n_dims": 1, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.zeros((adata_mod1.shape[0], par["n_dims"])) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obs=adata_mod1.obs, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"}, + obsm={"X_emb": 
sparse.csr_matrix(X) } +) + +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..872df1d9fa --- /dev/null +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: joint_embedding + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + arguments: + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test.R + - path: ../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R new file mode 100644 index 0000000000..d417775314 --- /dev/null +++ b/src/joint_embedding/mask_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = 
list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R new file mode 100644 index 0000000000..88930e00d2 --- /dev/null +++ b/src/joint_embedding/mask_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +par <- list( + input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running mask component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + "--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") 
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+output_mod1 <- anndata::read_h5ad(par$output_mod1)
+output_mod2 <- anndata::read_h5ad(par$output_mod2)
+output_solution <- anndata::read_h5ad(par$output_solution)
+
+cat("> Checking contents of h5ad files\n")
+expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod1$n_obs, input_mod1$n_obs)
+expect_equal(output_mod2$n_obs, input_mod2$n_obs)
+expect_equal(output_mod1$n_vars, input_mod1$n_vars)
+expect_equal(output_mod2$n_vars, input_mod2$n_vars)
+expect_equal(output_mod1$var_names, input_mod1$var_names)
+expect_equal(output_mod2$var_names, input_mod2$var_names)
+expect_equal(output_mod1$obs_names, input_mod1$obs_names)
+expect_equal(output_mod2$obs_names, input_mod2$obs_names)
+
+# TODO check contents of matrices, check rownames
+
+cat("> Test succeeded!\n")
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
new file mode 100644
index 0000000000..440172d748
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -0,0 +1,33 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: lmds
+  namespace: joint_embedding/methods
+  description: Landmark MDS dimensionality reduction on the Spearman distance.
+  info:
+    type: method
+    method_name: "LMDS"
+  arguments:
+    - name: "--distance_method"
+      type: "string"
+      default: "spearman"
+      description: The distance method to use. Possible values are euclidean, pearson, spearman and others.
+    - name: "--n_dims"
+      type: integer
+      default: 10
+      description: Number of dimensions to output.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R
new file mode 100644
index 0000000000..7dda2a3c4c
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/script.R
@@ -0,0 +1,44 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(anndata, warn.conflicts = FALSE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter."
+par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output/lmds/output.h5ad", + n_dims = 10L, + distance_method = "spearman" +) +## VIASH END + +cat("Reading h5ad files\n") +ad1 <- anndata::read_h5ad(par$input_mod1) +ad2 <- anndata::read_h5ad(par$input_mod2) + +cat("Performing DR\n") +dr <- lmds::lmds( + cbind(ad1$X, ad2$X), + ndim = par$n_dims, + distance_method = par$distance_method +) + +rownames(dr) <- rownames(ad1) +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = ad1$uns[["dataset_id"]], + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml new file mode 100644 index 0000000000..30dc728fd9 --- /dev/null +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: mnn + namespace: joint_embedding/methods + description: Mutual nearest neighbors correction followed by PCA. + info: + type: method + method_name: "MNN" + arguments: + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R new file mode 100644 index 0000000000..27a4454bff --- /dev/null +++ b/src/joint_embedding/methods/mnn/script.R @@ -0,0 +1,67 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("batchelor", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +Xt_mod1 <- t(input_mod1$X) + +# select hvg +if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::rowSds(Xt_mod1) + Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +rm(input_mod1) +gc() + +Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X) +if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::rowSds(Xt_mod2) + Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +cat("Running fastMNN\n") +mnn_out <- batchelor::fastMNN( + rbind(Xt_mod1, Xt_mod2), + batch = batch +) +dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected") + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml new file mode 100644 index 0000000000..0939d7b6f7 --- /dev/null +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave + 
namespace: joint_embedding/methods + description: Concatenated NewWave. + info: + type: method + method_name: "NewWave" + doi: "10.1101/2021.08.02.453487" + arguments: + - name: "--maxiter" + type: "integer" + default: 100 + description: Maximum number of NewWave iterations. + - name: "--k" + type: "integer" + default: 10 + description: NewWave K parameter. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R new file mode 100644 index 0000000000..f87d79cdd5 --- /dev/null +++ b/src/joint_embedding/methods/newwave/script.R @@ -0,0 +1,111 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + maxiter = 2L, + k = 3L, + hvg_sel = 1000 +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading mod1 h5ad\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] + +sd1 <- proxyC::colSds(input_mod1$X) +fil1 <- + if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) { + head(order(sd1, decreasing = TRUE), par$hvg_sel) + } else { + which(sd1 > 0) + } +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data1)), + n_cell_par = min(300, ncol(data1)), + commondispersion = FALSE +) +rm(data1) + +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading mod2 anndata\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +sd2 <- proxyC::colSds(input_mod2$X) +fil2 <- + if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) { + head(order(sd2, decreasing = TRUE), par$hvg_sel) + } else { + which(sd2 > 0) + } +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod2) +gc() + +cat("Running NewWave on mod2\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = par$k, + 
maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data2)), + n_cell_par = min(300, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) +rm(data2) + +cat("Spline separate DRs\n") +dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) { + cbind(dr_x1[, i], dr_x2[, i]) +})) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) + +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..860a48c300 --- /dev/null +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pca + namespace: joint_embedding/methods + description: PCA dimensionality reduction. + info: + type: method + method_name: "PCA" + authors: + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R new file mode 100644 index 0000000000..d16eb027cf --- /dev/null +++ b/src/joint_embedding/methods/pca/script.R @@ -0,0 +1,63 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 4L, + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, 
head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing DR\n") +dr <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = par$n_dims +)$x + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml new file mode 100644 index 0000000000..5b0969172a --- /dev/null +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: totalvi + namespace: joint_embedding/methods + version: dev + description: "totalVI: joint probabilistic modeling with Total Variational Inference" + info: + type: method + method_name: totalVI + doi: 10.1038/s41592-020-01050-x + arguments: + - name: --hvg_number + type: integer + default: 4000 + description: Number of HVG to include in totalVI + - name: --max_epochs + type: integer + default: 400 + description: Number of max epochs to run totalVI + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] + + + diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py new file mode 100644 index 0000000000..9b40e1f3d1 --- /dev/null +++ b/src/joint_embedding/methods/totalvi/script.py @@ -0,0 +1,58 @@ +import anndata +import scanpy as sc +from scipy import sparse +from scvi.model import TOTALVI + +## VIASH START +par = { + "input_mod1": 
"output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad", + "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "hvg_number": 4000, + "max_epochs": 20 +} + +meta = { + 'functionality_name': "foo" +} +## VIASH END + +print("Load and prepare data", flush=True) +adata_mod1 = anndata.read_h5ad(par['input_mod1']) +adata_mod2 = anndata.read_h5ad(par['input_mod2']) +adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray() + +print('Select highly variable genes', flush=True) +sc.pp.highly_variable_genes( + adata_mod1, + n_top_genes=par['hvg_number'], + flavor="seurat_v3", + batch_key="batch", + subset=True +) + +print("Set up model", flush=True) +TOTALVI.setup_anndata( + adata_mod1, + batch_key="batch", + protein_expression_obsm_key="protein_expression" +) + +print('Train totalVI with', par['max_epochs'], 'epochs', flush=True) +vae = TOTALVI(adata_mod1, latent_distribution="normal") +vae.train(max_epochs = par['max_epochs']) + +print("Postprocessing and saving output", flush=True) +adata_out = anndata.AnnData( + X=vae.get_latent_representation(), + obs=adata_mod1.obs[['batch']], + uns={ + "dataset_id": adata_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + }, + obsm = {"X_emb": sparse.csr_matrix(vae.get_latent_representation())} +) + +del adata_out.X + +adata_out.write_h5ad(par['output'], compression = "gzip") diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml new file mode 100644 index 0000000000..5ef2d3d11e --- /dev/null +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: umap + namespace: joint_embedding/methods + version: dev + description: UMAP dimensionality reduction on the Euclidean distance.
+ info: + type: method + method_name: UMAP + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--metric" + type: "string" + default: "euclidean" + description: The metric to use. Possible values are euclidean, cosine, manhattan. + - name: "--n_neighbors" + type: "integer" + default: 15 + description: Number of neighbors to use in the KNN. + - name: "--n_pcs" + type: "integer" + default: 50 + description: Number of principal components to use in the PCA step. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R new file mode 100644 index 0000000000..2f654be245 --- /dev/null +++ b/src/joint_embedding/methods/umap/script.R @@ -0,0 +1,80 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + n_neighbors = 15L, + metric = "euclidean", + n_pcs = 50L, + hvg_sel = 100L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing PCA\n") +X_pca <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = 100 +)$x + +cat("Performing UMap\n") +dr <- uwot::umap( + X_pca, + n_components = par$n_dims, + n_neighbors = par$n_neighbors, + metric = par$metric, + n_threads = n_cores, + nn_method = "annoy" +) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( + X_emb = as(dr, "CsparseMatrix") + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml new file 
mode 100644 index 0000000000..41ef532e5a --- /dev/null +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ari + namespace: joint_embedding/metrics + description: Adjusted rand index (ARI) + info: + metrics: + - metric_id: ari + metric_name: ari + metric_description: Adjusted rand index (ARI) + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib, scanpy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/script.py b/src/joint_embedding/metrics/ari/script.py new file mode 100644 index 0000000000..dc7c195f66 --- /dev/null +++ b/src/joint_embedding/metrics/ari/script.py @@ -0,0 +1,58 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.had", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + 
cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.ari(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=["ari"], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..e89b30dc90 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_batch + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of batches per label + info: + metrics: + - metric_id: asw_batch + metric_name: asw_batch + metric_description: Average silhouette width (ASW) of batches per label + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py new file mode 100644 index 0000000000..88e89c21a8 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -0,0 +1,55 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) + +## VIASH END + +if par['debug']: + 
pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette_batch( + adata, + batch_key='batch', + group_key='cell_type', + embed='X_emb', + verbose=False +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_batch'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml new file mode 100644 index 0000000000..262b48d6f4 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_label + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of labels + info: + metrics: + - metric_id: asw_label + metric_name: asw_label + metric_description: Average silhouette width (ASW) of labels + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py new file mode 100644 index 0000000000..4d04092004 --- /dev/null 
+++ b/src/joint_embedding/metrics/asw_label/script.py @@ -0,0 +1,49 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_label'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml new file mode 100644 index 0000000000..ba501a5e21 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: cc_cons + namespace: joint_embedding/metrics + description: Cell cycle conservation score + info: + metrics: + - metric_id: cc_cons + metric_name: cc_cons + 
metric_description: Cell cycle conservation score + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ vhighmem, midcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py new file mode 100644 index 0000000000..feecf45f12 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -0,0 +1,56 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) +## VIASH END + + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) +organism = adata_solution.uns['organism'] + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +recompute_cc = 'S_score' not in adata_solution.obs_keys() or \ + 'G2M_score' not in adata_solution.obs_keys() + +print('Compute score') +adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray() + +score = scib.me.cell_cycle( + adata_pre=adata_solution, + adata_post=adata, + batch_key='batch', + embed='X_emb', + recompute_cc=recompute_cc, + organism=organism +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns= { + "dataset_id":adata.uns['dataset_id'], + "method_id":adata.uns['method_id'], + "metric_ids":['cc_cons'], + "metric_values":[score], + 
} +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..90294f01e6 --- /dev/null +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: check_format + namespace: joint_embedding/metrics + description: Checking whether the prediction of a method has the right format. + info: + metrics: + - metric_id: finished + metric_name: finished + metric_description: check if metric finished + min: 0 + max: 1 + maximize: true + - metric_id: correct_format + metric_name: correct_format + metric_description: check if format is correct + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R new file mode 100644 index 0000000000..afe8ed10fd --- /dev/null +++ b/src/joint_embedding/metrics/check_format/script.R @@ -0,0 +1,61 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +## VIASH START +task <- "joint_embedding" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"), + input_prediction = paste0("resources_test/", task, 
"/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read prediction h5ad\n") +ad_sol <- read_h5ad(par$input_solution) + +cat("Checking solution h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- read_h5ad(par$input_prediction) + + # check dataset id + dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars >= 1, + ad_pred$n_vars <= 100, + !is.null(ad_pred$obs_names), + all(ad_pred$obs_names == ad_sol$obs_names) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..c79966893c --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: graph_connectivity + namespace: joint_embedding/metrics + description: Graph connectivity + info: + metrics: + - metric_id: graph_conn + metric_name: graph_conn + metric_description: Graph connectivity + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: 
python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ab9089d74a --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/script.py @@ -0,0 +1,53 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute score') +score = scib.me.graph_connectivity(adata, label_key='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns = { + 'dataset_id':adata.uns['dataset_id'], + 'method_id':adata.uns['method_id'], + 'metric_ids':['graph_conn'], + 'metric_values':[score] + } +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git 
a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml new file mode 100644 index 0000000000..bf989bfc20 --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: latent_mixing + namespace: joint_embedding/metrics + description: Calculate latent mixing metric for joint embedding task. + info: + metrics: + - metric_id: latent_mixing + metric_name: Latent mixing + metric_description: Calculate latent mixing + min: -1 + max: 0 + maximize: True + arguments: + - name: "--n_neighbors" + type: integer + default: 100 + description: Number of neighbors for the entropy_batch_mixing metric. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scikit-learn, scipy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py new file mode 100644 index 0000000000..511e7f8d2a --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/script.py @@ -0,0 +1,92 @@ +import anndata as ad +import numpy as np +import scipy +from sklearn.neighbors import NearestNeighbors + +# VIASH START +par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad", + "n_neighbors": 100 +} +# VIASH END + +print("Read input files") +predict_adata = ad.read_h5ad(par["input_prediction"]) 
+solution_adata = ad.read_h5ad(par["input_solution"]) + +print("Merge prediction with solution") +merged_adata = predict_adata.copy() + +batch_val = solution_adata.obs["batch"].astype(str) +batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True) + +merged_adata.obs["batch"] = batch_index + +def entropy_batch_mixing( + latent_space, batches, n_neighbors=50, n_pools=50, n_samples_per_pool=100 +): + + def neg_kl(hist_data, global_freq): + n_batches = len(np.unique(hist_data)) + if n_batches > 2: + raise ValueError("Should be only two clusters for this metric") + frequency = np.mean(hist_data == 1) + if frequency == 0 or frequency == 1: + return 0 + return -( + frequency * np.log(frequency / global_freq) + + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq)) + ) + + n_neighbors = min(n_neighbors, latent_space.shape[0] - 1) + nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8) + nne.fit(latent_space) + kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity( + latent_space.shape[0] + ) + + global_freq = np.mean(batches) + print(global_freq) + score = 0 + for t in range(n_pools): + indices = np.random.choice( + np.arange(latent_space.shape[0]), size=n_samples_per_pool + ) + score += np.mean( + [ + neg_kl( + batches[ # the batches of cell i's neighbors + kmatrix[indices].nonzero()[ + 1 + ][ # the neighbors of cell i (columns in row i) + kmatrix[indices].nonzero()[0] == i # the row of cell i + ] + ], + global_freq, + ) + for i in range(n_samples_per_pool) + ] + ) + return score / float(n_pools) + + +print("Calculate latent mixing metric") +latent_mixing = entropy_batch_mixing( + latent_space=merged_adata.obsm['X_emb'], + batches=merged_adata.obs["batch"].values, + n_neighbors=par["n_neighbors"] +) + +print("Write output") +adata_out = ad.AnnData( + uns = { + "dataset_id": predict_adata.uns["dataset_id"], + "method_id" : predict_adata.uns["method_id"], + "metric_ids" : ["latent_mixing"], + "metric_values" : 
[latent_mixing] + } +) + +adata_out.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml new file mode 100644 index 0000000000..e83772d65e --- /dev/null +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: nmi + namespace: joint_embedding/metrics + description: Normalised mutual information (NMI) + info: + metrics: + - metric_id: nmi + metric_name: NMI + metric_description: Normalised mutual information (NMI) + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py new file mode 100644 index 0000000000..73ef901bd7 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/script.py @@ -0,0 +1,64 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") 
+adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.nmi(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['nmi'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml new file mode 100644 index 0000000000..099219de47 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -0,0 +1,50 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: rfoob + namespace: joint_embedding/metrics + description: Calculating basic metrics for the joint embedding task. 
+ info: + metrics: + - metric_id: rfoob_celltype_accuracy + metric_name: Rfoob Celltype Accuracy + metric_description: check the celltype accuracy + min: 0 + max: 1 + maximize: True + - metric_id: rfoob_pseudotimegex_rsq + metric_name: rfoob pseudotimegex rsq + metric_description: rfoob pseudotimegex rsq + min: -1 + max: 1 + maximize: true + - metric_id: rfoob_pseudotimeadt_rsq + metric_name: rfoob pseudotimeadt rsq + metric_description: rfoob pseudotimeadt rsq + min: -1 + max: 1 + maximize: True + - metric_id: rfoob_batch_error + metric_name: rfoob batch error + metric_description: rfoob batch error + min: 0 + max: 1 + maximize: True + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ] + - type: python + pip: [ anndata>=0.8 ] + - type: r + cran: [ anndata, ranger, tidyverse, testthat] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R new file mode 100644 index 0000000000..e237192fe3 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/script.R @@ -0,0 +1,68 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = "scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution) + +cat("Read prediction 
h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +cat("Calculating metrics\n") +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]]) +rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))] +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]]) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch) +rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df) + +metric_values <- c( + rfoob_celltype_accuracy = 1 - rf1$prediction.error, + rfoob_pseudotimegex_rsq = rf2$r.squared, + rfoob_pseudotimeadt_rsq = rf3$r.squared, + rfoob_batch_error = rf4$prediction.error +) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(metric_values), + metric_values = metric_values + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml new file mode 100644 index 0000000000..68525b337b --- /dev/null +++ 
b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons + namespace: joint_embedding/metrics + description: Trajectory inference conservation score + info: + metrics: + - metric_id: ti_cons_RNA + metric_name: ti cons RNA + metric_description: ti cons RNA + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_ADT_ATAC + metric_name: ti cons ADT ATAC + metric_description: ti cons ADT ATAC + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_mean + metric_name: ti cons mean + metric_description: ti cons mean + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py new file mode 100644 index 0000000000..1d04067024 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/script.py @@ -0,0 +1,81 @@ + + +print('Importing libraries') +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = 
ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml new file mode 100644 index 0000000000..7139b8aadc --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons_batch + namespace: joint_embedding/metrics + description: Trajectory inference conservation score per batch + info: + metrics: + - metric_id: 
ti_cons_batch_RNA + metric_name: ti cons batch RNA + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_ADT_ATAC + metric_name: ti cons batch ADT ATAC + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_mean + metric_name: ti cons batch mean + metric_description: placeholder + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py new file mode 100644 index 0000000000..5206b59610 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/script.py @@ -0,0 +1,87 @@ +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons_batch' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] 
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + +# # store score as tsv +# with open(output, 'w') as file: +# header = ['dataset', 'output_type', 'metric', 'value'] +# entry = [dataset_id, OUTPUT_TYPE, METRIC, score] +# file.write('\t'.join(header) + '\n') +# file.write('\t'.join([str(x) for x in entry])) diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..b5a194e7b8 --- /dev/null +++ b/src/joint_embedding/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + 
+COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +# +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/joint_embedding/split_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..8a74b5a49a --- /dev/null +++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1" + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + # unique identifier of this benchmark entry + # (duplicate 'id' keys removed; a dict keeps only one) + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..5541edfcb5 --- /dev/null +++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#bin/viash_build -q 'joint_embedding|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad +MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/joint_embedding/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! 
Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# split dataset +viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/pca.h5ad + +# run one metric
viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pca.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/ari.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +nextflow \ + run . \ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_cite \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..5f3e7800cf --- /dev/null +++ b/src/joint_embedding/workflows/run/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: "run_benchmark" + namespace: "joint_embedding/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: "file" # todo: replace with includes + - name: "--input_mod2" + type: "file" # todo: replace with includes + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: 
nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf new file mode 100644 index 0000000000..45ac7ad76d --- /dev/null +++ b/src/joint_embedding/workflows/run/main.nf @@ -0,0 +1,152 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf" +include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf" + +// import methods +include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf" +include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf" +include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf" +include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf" +include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf" +include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf" + +// import metrics +include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf" +include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf" +include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf" +include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf" +include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf" +include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf" +include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf" +include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf" +include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf" +include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf" +include { ti_cons_batch } from 
"$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods (id -> method_module) +methods = [ lmds, mnn, newwave, pca, totalvi, umap] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_mod1", "input_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." 
+ method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config new file mode 100644 index 0000000000..ea674ffa07 --- /dev/null +++ b/src/joint_embedding/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'joint_embedding/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - joint embedding' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file