diff --git a/CHANGELOG.md b/CHANGELOG.md index aa367f2377..1befb3b2e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -179,3 +179,67 @@ * `metrics/rmse` should be removed because RMSE metrics don't really make sense here. * `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`. + + +## Multi modality - Joint Embedding + +### New functionality + +* `api/anndata_*`: Created a file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the mask, method and metric components. + +* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects. + +* `resources_test/joint_embedding/pancreas` with `src/joint_embedding/resources_test_scripts/pancreas.sh`. + +### neurips 2021 migration + +* `control_methods/random_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_random`. + +* `control_methods/zeros_embed`: Migrated from neurips 2021. Extracted from baseline method `dummy_zeros`. + +* `methods/lmds`: Migrated from neurips 2021. + +* `methods/mnn`: Migrated and adapted from neurips 2021. + +* `methods/newwave`: Migrated and adapted from neurips 2021. + +* `methods/pca`: Migrated from neurips 2021. + +* `methods/totalvi`: Migrated from neurips 2021. + +* `methods/umap`: Migrated from neurips 2021. + +* `metrics/ari`: Migrated from neurips 2021. + +* `metrics/asw_batch`: Migrated from neurips 2021. + +* `metrics/asw_label`: Migrated from neurips 2021. + +* `metrics/cc_cons`: Migrated from neurips 2021. + +* `metrics/check_format`: Migrated from neurips 2021. + +* `metrics/graph_connectivity`: Migrated from neurips 2021. + +* `metrics/latent_mixing`: Migrated from neurips 2021. + +* `metrics/nmi`: Migrated from neurips 2021. + +* `metrics/rfoob`: Migrated from neurips 2021. + +* `metrics/ti_cons`: Migrated from neurips 2021. + +* `metrics/ti_cons_batch`: Migrated from neurips 2021. 
+ +### changes from neurips 2021 + +* Updated docker config from R script. Was using an old `anndata` package which was giving warnings + +* stores the output from the methods in `.obsm["X_emb"]` instead of `.X` in the `anndata` + +* `X_emb` data is stored as a `Sparse Matrix` + + +* updated configs to latest `viash` \ No newline at end of file diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md new file mode 100644 index 0000000000..7d9f65431c --- /dev/null +++ b/src/joint_embedding/README.md @@ -0,0 +1,23 @@ +# Joint embedding + +Structure of this task: + + src/embedding + ├── api Interface specifications for components and datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml new file mode 100644 index 0000000000..23c6b427be --- /dev/null +++ b/src/joint_embedding/api/anndata_dataset.yaml @@ -0,0 +1,75 @@ +type: file +description: "A raw dataset" +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + 
description: The size factors created by the normalisation method, if any. + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: false + - type: string + name: sample_pm_varnames + description: + required: false + diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml new file mode 100644 index 0000000000..7ca6820671 --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -0,0 +1,37 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + 
required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml new file mode 100644 index 0000000000..ad735fffba --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod2.yaml @@ -0,0 +1,39 @@ +type: file +description: "The masked data for mod2 file" +example: "masked.h5ad" +info: + short_description: "Masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + obsm: + - type: double + name: gene_activity + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml new file mode 100644 index 0000000000..49d8ae7d79 --- /dev/null +++ b/src/joint_embedding/api/anndata_prediction.yaml @@ -0,0 +1,25 @@ +type: file +description: "The prediction file" +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + obsm: + - type: double + name: X_emb + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ 
b/src/joint_embedding/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml new file mode 100644 index 0000000000..2ed03e1cc3 --- /dev/null +++ b/src/joint_embedding/api/anndata_solution.yaml @@ -0,0 +1,57 @@ +type: file +description: "The solution for the data" +example: "solution.h5ad" +info: + short_description: "Solution" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which 
organism + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml new file mode 100644 index 0000000000..f2e96c35fd --- /dev/null +++ b/src/joint_embedding/api/authors.yaml @@ -0,0 +1,24 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Christopher Lance + email: clance.connect@gmail.com + roles: [ author, maintainer ] + props: { github: xlancelottx } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Ann Chen + email: ann.chen@czbiohub.org + roles: [ author, maintainer ] + props: { github: atchen} \ No newline at end of file diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml new file mode 100644 index 0000000000..d1ec13ed17 --- /dev/null +++ b/src/joint_embedding/api/comp_control_method.yaml @@ -0,0 +1,46 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files 
were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..0b97e89fca --- /dev/null +++ b/src/joint_embedding/api/comp_mask_dataset.yaml @@ -0,0 +1,15 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_mod1" + __merge__: anndata_masked_mod1.yaml + direction: output + - name: "--output_mod2" + __merge__: anndata_masked_mod2.yaml + direction: output + - name: "--output_solution" + __merge__: anndata_solution.yaml + direction: output \ No newline at end of file diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml new file mode 100644 index 0000000000..415f42ea3d --- /dev/null +++ b/src/joint_embedding/api/comp_method.yaml @@ -0,0 +1,50 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--input_mod2" + __merge__: anndata_masked_mod2.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = 
"resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--input_mod2", input_mod2_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") \ No newline at end of file diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml new file mode 100644 index 0000000000..76e3b17430 --- /dev/null +++ b/src/joint_embedding/api/comp_metric.yaml @@ -0,0 +1,87 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + - name: --debug + type: boolean_true + description: Verbose output for debugging. 
+ test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad" + input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad" + output_path = "output.h5ad" + + # load config yaml + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, stderr=subprocess.STDOUT, check=True) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert 
set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/joint_embedding/api/task_info.yaml b/src/joint_embedding/api/task_info.yaml new file mode 100644 index 0000000000..79267651ce --- /dev/null +++ b/src/joint_embedding/api/task_info.yaml @@ -0,0 +1,14 @@ +task_id: joint_embedding +task_name: Joint Embedding +summary: Learning of an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation). +description: | + The functioning of organs, tissues, and whole organisms is determined by the interplay of cells. + Cells are characterised into broad types, which in turn can take on different states. Here, a cell + state is made up of the sum of all processes that are occurring within the cell. We can gain insight + into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance, + or chromatin conformation. Combining this information to describe cellular heterogeneity requires the + formation of joint embeddings generated from this multimodal data. These embeddings must account for + and remove possible batch effects between different measurement batches. 
The reward for methods that + can achieve this is great: a highly resolved description of the underlying biological state of a cell + that determines its function, how it interacts with other cells, and thus the cell’s role in the f + unctioning of the whole tissue. \ No newline at end of file diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml new file mode 100644 index 0000000000..15b0c745e9 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_embed + namespace: joint_embedding/control_methods + description: Generate a random embedding from a normal distribution. + info: + type: negative_control + method_name: Normal Dist. + arguments: + - name: "--n_dims" + type: "integer" + default: 100 + description: Number of dimensions to output. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy , scipy] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py new file mode 100644 index 0000000000..199ad08176 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/script.py @@ -0,0 +1,26 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "output/output_prediction.h5ad", + "n_dims": 100, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.random.randn(adata_mod1.shape[0], par["n_dims"]) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obsm= {"X_emb": 
sparse.csr_matrix(X)}, + obs=adata_mod1.obs[["batch"]], + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"}, +) +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml new file mode 100644 index 0000000000..2cd0a9b71f --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: zeros_embed + namespace: joint_embedding/control_methods + description: Generate an embedding containing only zero values. + info: + type: negative_control + method_name: zeros_embed + arguments: + - name: "--n_dims" + type: "integer" + default: 1 + description: Number of dimensions to output. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, numpy, scipy ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py new file mode 100644 index 0000000000..f54ef3ce18 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -0,0 +1,27 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "tmp/output_prediction.h5ad", + "n_dims": 1, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.zeros((adata_mod1.shape[0], par["n_dims"])) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obs=adata_mod1.obs, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"}, + obsm={"X_emb": 
sparse.csr_matrix(X) } +) + +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..872df1d9fa --- /dev/null +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: joint_embedding + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + arguments: + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test.R + - path: ../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R new file mode 100644 index 0000000000..d417775314 --- /dev/null +++ b/src/joint_embedding/mask_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = 
list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R new file mode 100644 index 0000000000..88930e00d2 --- /dev/null +++ b/src/joint_embedding/mask_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +par <- list( + input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running mask component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + "--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") 
+input_mod1 <- anndata::read_h5ad(par$input_mod1)
+input_mod2 <- anndata::read_h5ad(par$input_mod2)
+output_mod1 <- anndata::read_h5ad(par$output_mod1)
+output_mod2 <- anndata::read_h5ad(par$output_mod2)
+output_solution <- anndata::read_h5ad(par$output_solution)
+
+cat("> Checking contents of h5ad files\n")
+expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE"))
+expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]])
+expect_equal(output_mod1$n_obs, input_mod1$n_obs)
+expect_equal(output_mod2$n_obs, input_mod2$n_obs)
+expect_equal(output_mod1$n_vars, input_mod1$n_vars)
+expect_equal(output_mod2$n_vars, input_mod2$n_vars)
+expect_equal(output_mod1$var_names, input_mod1$var_names)
+expect_equal(output_mod2$var_names, input_mod2$var_names)
+expect_equal(output_mod1$obs_names, input_mod1$obs_names)
+expect_equal(output_mod2$obs_names, input_mod2$obs_names)
+
+# TODO check contents of matrices, check rownames
+
+cat("> Test succeeded!\n")
diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml
new file mode 100644
index 0000000000..440172d748
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/config.vsh.yaml
@@ -0,0 +1,33 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: lmds
+  namespace: joint_embedding/methods
+  description: Landmark MDS dimensionality reduction on the Spearman distance.
+  info:
+    type: method
+    method_name: "LMDS"
+  arguments:
+    - name: "--distance_method"
+      type: "string"
+      default: "spearman"
+      description: The distance method to use. Possible values are euclidean, pearson, spearman and others.
+    - name: "--n_dims"
+      type: integer
+      default: 10
+      description: Number of dimensions to output.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3]
+      - type: python
+        pip: [anndata>=0.8]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R
new file mode 100644
index 0000000000..7dda2a3c4c
--- /dev/null
+++ b/src/joint_embedding/methods/lmds/script.R
@@ -0,0 +1,44 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(anndata, warn.conflicts = FALSE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter."
+par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output/lmds/output.h5ad", + n_dims = 10L, + distance_method = "spearman" +) +## VIASH END + +cat("Reading h5ad files\n") +ad1 <- anndata::read_h5ad(par$input_mod1) +ad2 <- anndata::read_h5ad(par$input_mod2) + +cat("Performing DR\n") +dr <- lmds::lmds( + cbind(ad1$X, ad2$X), + ndim = par$n_dims, + distance_method = par$distance_method +) + +rownames(dr) <- rownames(ad1) +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = ad1$uns[["dataset_id"]], + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml new file mode 100644 index 0000000000..30dc728fd9 --- /dev/null +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: mnn + namespace: joint_embedding/methods + description: Mutual nearest neighbors correction followed by PCA. + info: + type: method + method_name: "MNN" + arguments: + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R new file mode 100644 index 0000000000..27a4454bff --- /dev/null +++ b/src/joint_embedding/methods/mnn/script.R @@ -0,0 +1,67 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("batchelor", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +Xt_mod1 <- t(input_mod1$X) + +# select hvg +if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::rowSds(Xt_mod1) + Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +rm(input_mod1) +gc() + +Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X) +if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::rowSds(Xt_mod2) + Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +cat("Running fastMNN\n") +mnn_out <- batchelor::fastMNN( + rbind(Xt_mod1, Xt_mod2), + batch = batch +) +dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected") + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml new file mode 100644 index 0000000000..0939d7b6f7 --- /dev/null +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave + 
namespace: joint_embedding/methods + description: Concatenated NewWave. + info: + type: method + method_name: "NewWave" + doi: "10.1101/2021.08.02.453487" + arguments: + - name: "--maxiter" + type: "integer" + default: 100 + description: Maximum number of NewWave iterations. + - name: "--k" + type: "integer" + default: 10 + description: NewWave K parameter. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R new file mode 100644 index 0000000000..f87d79cdd5 --- /dev/null +++ b/src/joint_embedding/methods/newwave/script.R @@ -0,0 +1,111 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + maxiter = 2L, + k = 3L, + hvg_sel = 1000 +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading mod1 h5ad\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] + +sd1 <- proxyC::colSds(input_mod1$X) +fil1 <- + if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) { + head(order(sd1, decreasing = TRUE), par$hvg_sel) + } else { + which(sd1 > 0) + } +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data1)), + n_cell_par = min(300, ncol(data1)), + commondispersion = FALSE +) +rm(data1) + +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading mod2 anndata\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +sd2 <- proxyC::colSds(input_mod2$X) +fil2 <- + if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) { + head(order(sd2, decreasing = TRUE), par$hvg_sel) + } else { + which(sd2 > 0) + } +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod2) +gc() + +cat("Running NewWave on mod2\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = par$k, + 
maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data2)), + n_cell_par = min(300, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) +rm(data2) + +cat("Spline separate DRs\n") +dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) { + cbind(dr_x1[, i], dr_x2[, i]) +})) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) + +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..860a48c300 --- /dev/null +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pca + namespace: joint_embedding/methods + description: PCA dimensionality reduction. + info: + type: method + method_name: "PCA" + authors: + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R new file mode 100644 index 0000000000..d16eb027cf --- /dev/null +++ b/src/joint_embedding/methods/pca/script.R @@ -0,0 +1,63 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 4L, + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, 
head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing DR\n") +dr <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = par$n_dims +)$x + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml new file mode 100644 index 0000000000..5b0969172a --- /dev/null +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: totalvi + namespace: joint_embedding/methods + version: dev + description: "totalVI: joint probabilistic modeling with Total Variational Inference" + info: + type: method + method_name: totalVI + doi: 10.1038/s41592-020-01050-x + arguments: + - name: --hvg_number + type: integer + default: 4000 + description: Number of HVG to include in totalVI + - name: --max_epochs + type: integer + default: 400 + description: Number of max epochs to run totalVI + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] + + + diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py new file mode 100644 index 0000000000..9b40e1f3d1 --- /dev/null +++ b/src/joint_embedding/methods/totalvi/script.py @@ -0,0 +1,58 @@ +import anndata +import scanpy as sc +from scipy import sparse +from scvi.model import TOTALVI + +## VIASH START +par = { + "input_mod1": 
"output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad", + "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "hvg_number": 4000, + "max_epochs": 20 +} + +meta = { + 'functionality_name': "foo" +} +## VIASH END + +print("Load and prepare data", flush=True) +adata_mod1 = anndata.read_h5ad(par['input_mod1']) +adata_mod2 = anndata.read_h5ad(par['input_mod2']) +adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray() + +print('Select highly variable genes', flush=True) +sc.pp.highly_variable_genes( + adata_mod1, + n_top_genes=par['hvg_number'], + flavor="seurat_v3", + batch_key="batch", + subset=True +) + +print("Set up model", flush=True) +TOTALVI.setup_anndata( + adata_mod1, + batch_key="batch", + protein_expression_obsm_key="protein_expression" +) + +print('Train totalVI with', par['max_epochs'], 'epochs', flush=True) +vae = TOTALVI(adata_mod1, latent_distribution="normal") +vae.train(max_epochs = par['max_epochs']) + +print("Postprocessing and saving output", flush=True) +adata_out = anndata.AnnData( + X=vae.get_latent_representation(), + obs=adata_mod1.obs[['batch']], + uns={ + "dataset_id": adata_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + }, + obsm = {"X_emb": sparse.csr_matrix(vae.get_latent_representation())} +) + +del adata_out.X + +adata_out.write_h5ad(par['output'], compression = "gzip") diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml new file mode 100644 index 0000000000..5ef2d3d11e --- /dev/null +++ b/src/joint_embedding/methods/umap/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: umap + namespace: joint_embedding/methods + version: dev + description: UMAP dimensionality reduction on the Euclidean distance.
+ info: + type: method + method_name: UMAP + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--metric" + type: "string" + default: "euclidean" + description: The metric to use. Possible values are euclidean, cosine, manhattan. + - name: "--n_neighbors" + type: "integer" + default: 15 + description: Number of neighbors to use in the KNN. + - name: "--n_pcs" + type: "integer" + default: 50 + description: Number of principal components to use in the PCA step. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R new file mode 100644 index 0000000000..2f654be245 --- /dev/null +++ b/src/joint_embedding/methods/umap/script.R @@ -0,0 +1,80 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + n_neighbors = 15L, + metric = "euclidean", + n_pcs = 50L, + hvg_sel = 100L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing PCA\n") +X_pca <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = 100 +)$x + +cat("Performing UMap\n") +dr <- uwot::umap( + X_pca, + n_components = par$n_dims, + n_neighbors = par$n_neighbors, + metric = par$metric, + n_threads = n_cores, + nn_method = "annoy" +) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( + X_emb = as(dr, "CsparseMatrix") + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml new file 
mode 100644 index 0000000000..41ef532e5a --- /dev/null +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ari + namespace: joint_embedding/metrics + description: Adjusted rand index (ARI) + info: + metrics: + - metric_id: ari + metric_name: ari + metric_description: Adjusted rand index (ARI) + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib, scanpy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/script.py b/src/joint_embedding/metrics/ari/script.py new file mode 100644 index 0000000000..dc7c195f66 --- /dev/null +++ b/src/joint_embedding/metrics/ari/script.py @@ -0,0 +1,58 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.had", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + 
cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.ari(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=["ari"], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..e89b30dc90 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_batch + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of batches per label + info: + metrics: + - metric_id: asw_batch + metric_name: asw_batch + metric_description: Average silhouette width (ASW) of batches per label + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py new file mode 100644 index 0000000000..88e89c21a8 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -0,0 +1,55 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) + +## VIASH END + +if par['debug']: + 
pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette_batch( + adata, + batch_key='batch', + group_key='cell_type', + embed='X_emb', + verbose=False +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_batch'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml new file mode 100644 index 0000000000..262b48d6f4 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_label + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of labels + info: + metrics: + - metric_id: asw_label + metric_name: asw_label + metric_description: Average silhouette width (ASW) of labels + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py new file mode 100644 index 0000000000..4d04092004 --- /dev/null 
+++ b/src/joint_embedding/metrics/asw_label/script.py @@ -0,0 +1,49 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_label'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml new file mode 100644 index 0000000000..ba501a5e21 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: cc_cons + namespace: joint_embedding/metrics + description: Cell cycle conservation score + info: + metrics: + - metric_id: cc_cons + metric_name: cc_cons + 
metric_description: Cell cycle conservation score + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ vhighmem, midcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py new file mode 100644 index 0000000000..feecf45f12 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -0,0 +1,56 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) +## VIASH END + + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) +organism = adata_solution.uns['organism'] + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +recompute_cc = 'S_score' not in adata_solution.obs_keys() or \ + 'G2M_score' not in adata_solution.obs_keys() + +print('Compute score') +adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray() + +score = scib.me.cell_cycle( + adata_pre=adata_solution, + adata_post=adata, + batch_key='batch', + embed='X_emb', + recompute_cc=recompute_cc, + organism=organism +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns= { + "dataset_id":adata.uns['dataset_id'], + "method_id":adata.uns['method_id'], + "metric_ids":['cc_cons'], + "metric_values":[score], + 
} +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..90294f01e6 --- /dev/null +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: check_format + namespace: joint_embedding/metrics + description: Checking whether the prediction of a method has the right format. + info: + metrics: + - metric_id: finished + metric_name: finished + metric_description: check if metric finished + min: 0 + max: 1 + maximize: true + - metric_id: correct_format + metric_name: correct_format + metric_description: check if format is correct + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R new file mode 100644 index 0000000000..afe8ed10fd --- /dev/null +++ b/src/joint_embedding/metrics/check_format/script.R @@ -0,0 +1,61 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +## VIASH START +task <- "joint_embedding" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"), + input_prediction = paste0("resources_test/", task, 
"/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read prediction h5ad\n") +ad_sol <- read_h5ad(par$input_solution) + +cat("Checking solution h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- read_h5ad(par$input_prediction) + + # check dataset id + dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars >= 1, + ad_pred$n_vars <= 100, + !is.null(ad_pred$obs_names), + all(ad_pred$obs_names == ad_sol$obs_names) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..c79966893c --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: graph_connectivity + namespace: joint_embedding/metrics + description: Graph connectivity + info: + metrics: + - metric_id: graph_conn + metric_name: graph_conn + metric_description: Graph connectivity + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: 
python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ab9089d74a --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/script.py @@ -0,0 +1,53 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute score') +score = scib.me.graph_connectivity(adata, label_key='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns = { + 'dataset_id':adata.uns['dataset_id'], + 'method_id':adata.uns['method_id'], + 'metric_ids':['graph_conn'], + 'metric_values':[score] + } +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git 
a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml new file mode 100644 index 0000000000..bf989bfc20 --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: latent_mixing + namespace: joint_embedding/metrics + description: Calculate latent mixing metric for joint embedding task. + info: + metrics: + - metric_id: latent_mixing + metric_name: Latent mixing + metric_description: Calculate latent mixing + min: -1 + max: 0 + maximize: True + arguments: + - name: "--n_neighbors" + type: integer + default: 100 + description: Number of neighbors for the entropy_batch_mixing metric. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scikit-learn, scipy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py new file mode 100644 index 0000000000..511e7f8d2a --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/script.py @@ -0,0 +1,92 @@ +import anndata as ad +import numpy as np +import scipy +from sklearn.neighbors import NearestNeighbors + +# VIASH START +par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad", + "n_neighbors": 100 +} +# VIASH END + +print("Read input files") +predict_adata = ad.read_h5ad(par["input_prediction"]) 
+solution_adata = ad.read_h5ad(par["input_solution"]) + +print("Merge prediction with solution") +merged_adata = predict_adata.copy() + +batch_val = solution_adata.obs["batch"].astype(str) +batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True) + +merged_adata.obs["batch"] = batch_index + +def entropy_batch_mixing( + latent_space, batches, n_neighbors=50, n_pools=50, n_samples_per_pool=100 +): + + def neg_kl(hist_data, global_freq): + n_batches = len(np.unique(hist_data)) + if n_batches > 2: + raise ValueError("Should be only two clusters for this metric") + frequency = np.mean(hist_data == 1) + if frequency == 0 or frequency == 1: + return 0 + return -( + frequency * np.log(frequency / global_freq) + + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq)) + ) + + n_neighbors = min(n_neighbors, latent_space.shape[0] - 1) + nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8) + nne.fit(latent_space) + kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity( + latent_space.shape[0] + ) + + global_freq = np.mean(batches) + print(global_freq) + score = 0 + for t in range(n_pools): + indices = np.random.choice( + np.arange(latent_space.shape[0]), size=n_samples_per_pool + ) + score += np.mean( + [ + neg_kl( + batches[ # the batches of cell i's neighbors + kmatrix[indices].nonzero()[ + 1 + ][ # the neighbors of cell i (columns in row i) + kmatrix[indices].nonzero()[0] == i # the row of cell i + ] + ], + global_freq, + ) + for i in range(n_samples_per_pool) + ] + ) + return score / float(n_pools) + + +print("Calculate latent mixing metric") +latent_mixing = entropy_batch_mixing( + latent_space=merged_adata.obsm['X_emb'], + batches=merged_adata.obs["batch"].values, + n_neighbors=par["n_neighbors"] +) + +print("Write output") +adata_out = ad.AnnData( + uns = { + "dataset_id": predict_adata.uns["dataset_id"], + "method_id" : predict_adata.uns["method_id"], + "metric_ids" : ["latent_mixing"], + "metric_values" : 
[latent_mixing] + } +) + +adata_out.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml new file mode 100644 index 0000000000..e83772d65e --- /dev/null +++ b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: nmi + namespace: joint_embedding/metrics + description: Normalised mutual information (NMI) + info: + metrics: + - metric_id: nmi + metric_name: NMI + metric_description: Normalised mutual information (NMI) + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py new file mode 100644 index 0000000000..73ef901bd7 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/script.py @@ -0,0 +1,64 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") 
+adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.nmi(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['nmi'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml new file mode 100644 index 0000000000..099219de47 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -0,0 +1,50 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: rfoob + namespace: joint_embedding/metrics + description: Calculating basic metrics for the joint embedding task. 
+ info: + metrics: + - metric_id: rfoob_celltype_accuracy + metric_name: Rfoob Celltype Accuracy + metric_description: check the celltype accuracy + min: 0 + max: 1 + maximize: True + - metric_id: rfoob_pseudotimegex_rsq + metric_name: rfoob pseudotimegex rsq + metric_description: rfoob pseudotimegex rsq + min: -1 + max: 1 + maximize: true + - metric_id: rfoob_pseudotimeadt_rsq + metric_name: rfoob pseudotimeadt rsq + metric_description: rfoob pseudotimeadt rsq + min: -1 + max: 1 + maximize: True + - metric_id: rfoob_batch_error + metric_name: rfoob batch error + metric_description: rfoob batch error + min: 0 + max: 1 + maximize: True + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ] + - type: python + pip: [ anndata>=0.8 ] + - type: r + cran: [ anndata, ranger, tidyverse, testthat] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R new file mode 100644 index 0000000000..e237192fe3 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/script.R @@ -0,0 +1,68 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = "scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution) + +cat("Read prediction 
h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +cat("Calculating metrics\n") +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]]) +rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))] +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]]) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch) +rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df) + +metric_values <- c( + rfoob_celltype_accuracy = 1 - rf1$prediction.error, + rfoob_pseudotimegex_rsq = rf2$r.squared, + rfoob_pseudotimeadt_rsq = rf3$r.squared, + rfoob_batch_error = rf4$prediction.error +) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(metric_values), + metric_values = metric_values + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml new file mode 100644 index 0000000000..68525b337b --- /dev/null +++ 
b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons + namespace: joint_embedding/metrics + description: Trajectory inference conservation score + info: + metrics: + - metric_id: ti_cons_RNA + metric_name: ti cons RNA + metric_description: ti cons RNA + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_ADT_ATAC + metric_name: ti cons ADT ATAC + metric_description: ti cons ADT ATAC + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_mean + metric_name: ti cons mean + metric_description: ti cons mean + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py new file mode 100644 index 0000000000..1d04067024 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/script.py @@ -0,0 +1,81 @@ + + +print('Importing libraries') +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = 
ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml new file mode 100644 index 0000000000..7139b8aadc --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons_batch + namespace: joint_embedding/metrics + description: Trajectory inference conservation score per batch + info: + metrics: + - metric_id: 
ti_cons_batch_RNA + metric_name: ti cons batch RNA + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_ADT_ATAC + metric_name: ti cons batch ADT ATAC + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_mean + metric_name: ti cons batch mean + metric_description: placeholder + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py new file mode 100644 index 0000000000..5206b59610 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/script.py @@ -0,0 +1,87 @@ +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons_batch' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] 
+adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + +# # store score as tsv +# with open(output, 'w') as file: +# header = ['dataset', 'output_type', 'metric', 'value'] +# entry = [dataset_id, OUTPUT_TYPE, METRIC, score] +# file.write('\t'.join(header) + '\n') +# file.write('\t'.join([str(x) for x in entry])) diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..b5a194e7b8 --- /dev/null +++ b/src/joint_embedding/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + 
+COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +# +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/joint_embedding/split_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..8a74b5a49a --- /dev/null +++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1" + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + # unique identifier of this benchmark entry + # (duplicate 'id' keys removed; a dict keeps only one) + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..5541edfcb5 --- /dev/null +++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#bin/viash_build -q 'joint_embedding|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad +MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/joint_embedding/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! 
Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# split dataset +viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/pca.h5ad + +# run one metric
viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pca.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/ari.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +nextflow \ + run . \ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_cite \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..5f3e7800cf --- /dev/null +++ b/src/joint_embedding/workflows/run/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: "run_benchmark" + namespace: "joint_embedding/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: "file" # todo: replace with includes + - name: "--input_mod2" + type: "file" # todo: replace with includes + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: 
nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf new file mode 100644 index 0000000000..45ac7ad76d --- /dev/null +++ b/src/joint_embedding/workflows/run/main.nf @@ -0,0 +1,152 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf" +include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf" + +// import methods +include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf" +include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf" +include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf" +include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf" +include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf" +include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf" + +// import metrics +include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf" +include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf" +include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf" +include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf" +include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf" +include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf" +include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf" +include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf" +include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf" +include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf" +include { ti_cons_batch } from 
"$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods (id -> method_module) +methods = [ lmds, mnn, newwave, pca, totalvi, umap] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_mod1", "input_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." 
+ method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config new file mode 100644 index 0000000000..ea674ffa07 --- /dev/null +++ b/src/joint_embedding/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'joint_embedding/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - joint embedding' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file