diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa367f2377..1befb3b2e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -179,3 +179,67 @@
 * `metrics/rmse` should be removed because RMSE metrics don't really make sense here.
 * `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`.
+
+
+## Multi-modality - Joint Embedding
+
+### New functionality
+
+* `api/anndata_*`: Created file format specifications for the h5ad files used throughout the pipeline.
+
+* `api/comp_*`: Created API definitions for the mask, method and metric components.
+
+* `mask_dataset`: Added a component for masking raw datasets into task-ready dataset objects.
+
+* Added `resources_test/joint_embedding/pancreas`, generated with `src/joint_embedding/resources_test_scripts/pancreas.sh`.
+
+### NeurIPS 2021 migration
+
+* `control_methods/random_embed`: Migrated from NeurIPS 2021. Extracted from baseline method `dummy_random`.
+
+* `control_methods/zeros_embed`: Migrated from NeurIPS 2021. Extracted from baseline method `dummy_zeros`.
+
+* `methods/lmds`: Migrated from NeurIPS 2021.
+
+* `methods/mnn`: Migrated and adapted from NeurIPS 2021.
+
+* `methods/newwave`: Migrated and adapted from NeurIPS 2021.
+
+* `methods/pca`: Migrated from NeurIPS 2021.
+
+* `methods/totalvi`: Migrated from NeurIPS 2021.
+
+* `methods/umap`: Migrated from NeurIPS 2021.
+
+* `metrics/ari`: Migrated from NeurIPS 2021.
+
+* `metrics/asw_batch`: Migrated from NeurIPS 2021.
+
+* `metrics/asw_label`: Migrated from NeurIPS 2021.
+
+* `metrics/cc_cons`: Migrated from NeurIPS 2021.
+
+* `metrics/check_format`: Migrated from NeurIPS 2021.
+
+* `metrics/graph_connectivity`: Migrated from NeurIPS 2021.
+
+* `metrics/latent_mixing`: Migrated from NeurIPS 2021.
+
+* `metrics/nmi`: Migrated from NeurIPS 2021.
+
+* `metrics/rfoob`: Migrated from NeurIPS 2021.
+
+* `metrics/ti_cons`: Migrated from NeurIPS 2021.
+
+* `metrics/ti_cons_batch`: Migrated from NeurIPS 2021.
+
+### Changes from NeurIPS 2021
+
+* Updated the Docker config used by the R scripts, which was using an old `anndata` package that produced warnings.
+
+* Methods now store their output in `.obsm["X_emb"]` of the `anndata` object instead of `.X`.
+
+* `X_emb` data is stored as a sparse matrix.
+
+
+* Updated configs to the latest `viash` version.
\ No newline at end of file
diff --git a/src/joint_embedding/README.md b/src/joint_embedding/README.md
new file mode 100644
index 0000000000..7d9f65431c
--- /dev/null
+++ b/src/joint_embedding/README.md
@@ -0,0 +1,23 @@
+# Joint embedding
+
+Structure of this task:
+
+    src/joint_embedding
+    ├── api                     Interface specifications for components and datasets in this task
+    ├── control_methods         Baseline (random/ground truth) methods to compare methods against
+    ├── mask_dataset            Component to mask common datasets for this task
+    ├── methods                 Methods to be benchmarked
+    ├── metrics                 Metrics used to quantify performance of methods
+    ├── README.md               This file
+    ├── resources_scripts       Scripts to process the datasets
+    ├── resources_test_scripts  Scripts to process the test resources
+    └── workflows               Pipelines to run the full benchmark
+
+Relevant links:
+
+* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/)
+
+* [Experimental results](https://openproblems-experimental.netlify.app/results/joint_embedding/)
+
+
+* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md)
diff --git a/src/joint_embedding/api/anndata_dataset.yaml b/src/joint_embedding/api/anndata_dataset.yaml
new file mode 100644
index 0000000000..23c6b427be
--- /dev/null
+++ b/src/joint_embedding/api/anndata_dataset.yaml
@@ -0,0 +1,75 @@
+type: file
+description: "A raw dataset"
+example: "dataset.h5ad"
+info:
+  label: "Dataset"
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    obs:
+      - type: string
+        name: batch
+        description: Batch information
+        required: true
+      - type: double
+        name: size_factors
+        description: The size factors created by the normalisation method, if any.
+ required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + - type: boolean + name: is_train + description: if sample is train data + required: true + var: + - type: string + name: gene_ids + description: + required: false + - type: string + name: feature_types + description: + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "data from which organism " + required: false + - type: string + name: gene_activity_var_names + description: + required: false + - type: string + name: sample_pm_varnames + description: + required: false + diff --git a/src/joint_embedding/api/anndata_masked_mod1.yaml b/src/joint_embedding/api/anndata_masked_mod1.yaml new file mode 100644 index 0000000000..7ca6820671 --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod1.yaml @@ -0,0 +1,37 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_masked_mod2.yaml b/src/joint_embedding/api/anndata_masked_mod2.yaml new file mode 100644 index 0000000000..ad735fffba --- /dev/null +++ b/src/joint_embedding/api/anndata_masked_mod2.yaml @@ -0,0 +1,39 @@ +type: file +description: "The masked data for mod2 file" +example: "masked.h5ad" +info: + short_description: "Masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + obsm: + - type: double + name: gene_activity + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/anndata_prediction.yaml b/src/joint_embedding/api/anndata_prediction.yaml new file mode 100644 index 0000000000..49d8ae7d79 --- /dev/null +++ b/src/joint_embedding/api/anndata_prediction.yaml @@ -0,0 +1,25 @@ +type: file +description: "The prediction file" +example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + obsm: + - type: double + name: X_emb + description: + required: true + uns: + - type: string + name: 
dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/joint_embedding/api/anndata_score.yaml b/src/joint_embedding/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/joint_embedding/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/joint_embedding/api/anndata_solution.yaml b/src/joint_embedding/api/anndata_solution.yaml new file mode 100644 index 0000000000..2ed03e1cc3 --- /dev/null +++ b/src/joint_embedding/api/anndata_solution.yaml @@ -0,0 +1,57 @@ +type: file +description: "The solution for the data" +example: "solution.h5ad" +info: + short_description: "Solution" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: false + - type: string + name: cell_type + description: Type of cells + required: false + - type: string + name: pseudotime_order_GEX + description: + required: false + - type: string + name: pseudotime_order_ATAC + description: + required: false + - type: string + name: pseudotime_order_ADT + description: + required: false + - type: double + name: S_score + description: + required: false + - type: double + name: G2M_score + description: + required: false + var: + - type: string + name: feature_types + description: + required: true + - type: string + name: gene_ids + description: + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: which organism + required: true diff --git a/src/joint_embedding/api/authors.yaml b/src/joint_embedding/api/authors.yaml new file mode 100644 index 0000000000..f2e96c35fd --- /dev/null +++ b/src/joint_embedding/api/authors.yaml @@ -0,0 +1,24 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Christopher Lance + email: clance.connect@gmail.com + roles: [ author, maintainer ] + props: { github: xlancelottx } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Ann Chen + email: ann.chen@czbiohub.org + roles: [ author, maintainer ] + props: { github: atchen} \ No newline at end of file diff --git a/src/joint_embedding/api/comp_control_method.yaml b/src/joint_embedding/api/comp_control_method.yaml new file mode 100644 index 0000000000..d1ec13ed17 --- /dev/null +++ 
b/src/joint_embedding/api/comp_control_method.yaml @@ -0,0 +1,46 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") diff --git a/src/joint_embedding/api/comp_mask_dataset.yaml b/src/joint_embedding/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..0b97e89fca --- /dev/null +++ b/src/joint_embedding/api/comp_mask_dataset.yaml @@ -0,0 +1,15 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_mod1" + __merge__: anndata_masked_mod1.yaml + direction: output + - name: "--output_mod2" + __merge__: anndata_masked_mod2.yaml + direction: output + - name: "--output_solution" + __merge__: anndata_solution.yaml + direction: output \ No newline at end of file diff --git a/src/joint_embedding/api/comp_method.yaml b/src/joint_embedding/api/comp_method.yaml new file mode 100644 index 0000000000..415f42ea3d --- /dev/null +++ b/src/joint_embedding/api/comp_method.yaml @@ -0,0 +1,50 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_masked_mod1.yaml + - name: "--input_mod2" + __merge__: anndata_masked_mod2.yaml + - name: "--output" + __merge__: anndata_prediction.yaml + direction: output + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + from scipy import sparse + + input_mod1_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod1.h5ad" + input_mod2_path = "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter.mod2.h5ad" + output_path = "output.h5ad" + + cmd = [ + meta['executable'], + "--input_mod1", input_mod1_path, + "--input_mod2", input_mod2_path, + "--output", output_path + ] + + print(">> Running script as test") + out = subprocess.run(cmd, check=True, capture_output=True, text=True) + + print("> Checking whether output files were created") + assert path.exists(output_path) + + print("> Reading h5ad files") + input_mod1 = ad.read_h5ad(input_mod1_path) + output = ad.read_h5ad(output_path) + + print("> Checking contents of output.h5ad") + assert output.uns['dataset_id'] == input_mod1.uns['dataset_id'] + 
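+      # These asserts encode the prediction contract shared by all methods:
+      # same cells as the input, between 1 and 100 embedding dimensions, and
+      # the embedding stored as a sparse matrix in obsm['X_emb'].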
assert output.uns['method_id'] == meta['functionality_name'] + assert output.n_obs == input_mod1.n_obs + assert output.n_vars >= 1 + assert output.n_vars <= 100 + assert all(output.obs_names == input_mod1.obs_names) + assert sparse.issparse(output.obsm['X_emb']) + + print("> Test succeeded!") \ No newline at end of file diff --git a/src/joint_embedding/api/comp_metric.yaml b/src/joint_embedding/api/comp_metric.yaml new file mode 100644 index 0000000000..bde43a316f --- /dev/null +++ b/src/joint_embedding/api/comp_metric.yaml @@ -0,0 +1,97 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + - name: --debug + type: boolean_true + description: Verbose output for debugging. + test_resources: + - path: ../../../../resources_test + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + ## VIASH START + # This code block will be replaced by viash at runtime. + par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "output.h5ad" + } + meta = { 'functionality_name': 'foo' } + + ## VIASH END + + input_prediction_path = "resources_test/common/joint_embedding/cite_random_prediction.h5ad" + input_solution_path = "resources_test/common/joint_embedding/cite_solution.h5ad" + output_path = "output.h5ad" + # define some filenames + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'id': str, 'label': str, 'description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if 
metric_value lies within the expected range
+        output_uns = pd.DataFrame({
+          'id': output.uns['metric_ids'],
+          'value': output.uns['metric_values']
+        })
+
+        scores = metric_meta.merge(output_uns, on="id")
+
+        assert all(scores.value >= scores['min'])
+        assert all(scores.value <= scores['max'])
+
+        print("> Test succeeded!", flush=True)
diff --git a/src/joint_embedding/api/task_info.yaml b/src/joint_embedding/api/task_info.yaml
new file mode 100644
index 0000000000..79267651ce
--- /dev/null
+++ b/src/joint_embedding/api/task_info.yaml
@@ -0,0 +1,14 @@
+task_id: joint_embedding
+task_name: Joint Embedding
+summary: Learning an embedded space that leverages the information of multiple modalities (e.g. for improved cell type annotation).
+description: |
+  The functioning of organs, tissues, and whole organisms is determined by the interplay of cells.
+  Cells are characterised into broad types, which in turn can take on different states. Here, a cell
+  state is made up of the sum of all processes that are occurring within the cell. We can gain insight
+  into the state of a cell by different types of measurements: e.g., RNA expression, protein abundance,
+  or chromatin conformation. Combining this information to describe cellular heterogeneity requires the
+  formation of joint embeddings generated from this multimodal data. These embeddings must account for
+  and remove possible batch effects between different measurement batches. The reward for methods that
+  can achieve this is great: a highly resolved description of the underlying biological state of a cell
+  that determines its function, how it interacts with other cells, and thus the cell's role in the
+  functioning of the whole tissue.
\ No newline at end of file
diff --git a/src/joint_embedding/control_methods/random_embed/config.vsh.yaml b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
new file mode 100644
index 0000000000..15b0c745e9
--- /dev/null
+++ b/src/joint_embedding/control_methods/random_embed/config.vsh.yaml
@@ -0,0 +1,25 @@
+__merge__: ../../api/comp_control_method.yaml
+functionality:
+  name: random_embed
+  namespace: joint_embedding/control_methods
+  description: Generate a random embedding from a normal distribution.
+  info:
+    type: negative_control
+    method_name: Normal Dist.
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 100
+      description: Number of dimensions to output.
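+  # The __merge__ key at the top pulls in the shared component api
+  # (the --input_mod1/--output arguments and the generic test) from
+  # comp_control_method.yaml, so only the component-specific --n_dims
+  # argument is declared here.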
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy , scipy] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/random_embed/script.py b/src/joint_embedding/control_methods/random_embed/script.py new file mode 100644 index 0000000000..199ad08176 --- /dev/null +++ b/src/joint_embedding/control_methods/random_embed/script.py @@ -0,0 +1,26 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "output/output_prediction.h5ad", + "n_dims": 100, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.random.randn(adata_mod1.shape[0], par["n_dims"]) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obsm= {"X_emb": sparse.csr_matrix(X)}, + obs=adata_mod1.obs[["batch"]], + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "random_embed"}, +) +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml new file mode 100644 index 0000000000..2cd0a9b71f --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: zeros_embed + namespace: joint_embedding/control_methods + description: Generate an embedding containing only zero values. + info: + type: negative_control + method_name: zeros_embed + arguments: + - name: "--n_dims" + type: "integer" + default: 1 + description: Number of dimensions to output. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, numpy, scipy ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/control_methods/zeros_embed/script.py b/src/joint_embedding/control_methods/zeros_embed/script.py new file mode 100644 index 0000000000..f54ef3ce18 --- /dev/null +++ b/src/joint_embedding/control_methods/zeros_embed/script.py @@ -0,0 +1,27 @@ +import anndata +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.mod1.h5ad", + "output": "tmp/output_prediction.h5ad", + "n_dims": 1, +} +## VIASH END + +print("Load and prepare data") +adata_mod1 = anndata.read_h5ad(par["input_mod1"]) + +X = np.zeros((adata_mod1.shape[0], par["n_dims"])) +print("Saving output") +adata_out = anndata.AnnData( + X=X, + obs=adata_mod1.obs, + uns={"dataset_id": adata_mod1.uns["dataset_id"], "method_id": "zeros_embed"}, + obsm={"X_emb": sparse.csr_matrix(X) } +) + +del adata_out.X + +adata_out.write_h5ad(par["output"], compression="gzip") diff --git a/src/joint_embedding/mask_dataset/config.vsh.yaml b/src/joint_embedding/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..872df1d9fa --- /dev/null +++ b/src/joint_embedding/mask_dataset/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: joint_embedding + description: | + A component for censoring joint embedding datasets to be given + to competition participants for the 'joint embedding' task. + arguments: + - name: "--train_only" + type: "boolean_true" + description: Whether or not to only omit the train cells. + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test.R + - path: ../../../resources_test +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse , testthat] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, lowcpu ] diff --git a/src/joint_embedding/mask_dataset/script.R b/src/joint_embedding/mask_dataset/script.R new file mode 100644 index 0000000000..d417775314 --- /dev/null +++ b/src/joint_embedding/mask_dataset/script.R @@ -0,0 +1,97 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
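+# Everything between VIASH START and VIASH END holds debugging defaults for
+# running the script directly; viash replaces this block with the actual
+# parameter values at runtime.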
+output_path <- "" + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_mod1 = paste0(output_path, "output_mod1.h5ad"), + output_mod2 = paste0(output_path, "output_mod2.h5ad"), + output_solution = paste0(output_path, "solution.h5ad"), + train_only = TRUE +) +## VIASH END + +cat("Reading mod1 data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_JE") +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") + +cat("Creating mod1 object\n") +out_mod1 <- anndata::AnnData( + X = input_mod1$X, + layers = list(counts = input_mod1$layers[["counts"]]), + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select(one_of("batch", "size_factors")), + uns = ad1_uns +) + +cat("Create solution object\n") +out_solution <- anndata::AnnData( + X = input_mod1$X, + var = input_mod1$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod1$obs %>% select( + one_of("batch", "cell_type", "pseudotime_order_GEX", "pseudotime_order_ATAC", "pseudotime_order_ADT", "S_score", "G2M_score") + ), + uns = ad1_uns +) + +is_train <- input_mod1$obs$is_train + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod1 <- out_mod1[is_train, ] #$copy() + out_solution <- out_solution[is_train, ]# $copy() +} + +rm(input_mod1) +gc() + +cat("Reading mod2 data\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +ad2_obsm <- list() + +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Creating mod2 object\n") +out_mod2 <- anndata::AnnData( + X = input_mod2$X, + layers = list(counts = input_mod2$layers[["counts"]]), + var = input_mod2$var %>% select(one_of("gene_ids"), feature_types), + obs = input_mod2$obs %>% select(one_of("batch")), + obsm = ad2_obsm, + uns = ad2_uns +) +rm(input_mod2) +gc() + +if (par$train_only) { + cat("Filtering out test cells\n", sep = "") + out_mod2 <- out_mod2[is_train, ] #$copy() +} + +cat("Saving output files as h5ad\n") +cat("output_mod1:") +print(out_mod1) +zzz <- out_mod1$write_h5ad(par$output_mod1, compression = "gzip") + +cat("output_mod2:") +print(out_mod2) +zzz <- out_mod2$write_h5ad(par$output_mod2, compression = "gzip") + +cat("output_solution:") +print(out_solution) +zzz <- out_solution$write_h5ad(par$output_solution, compression = "gzip") diff --git a/src/joint_embedding/mask_dataset/test.R b/src/joint_embedding/mask_dataset/test.R new file mode 100644 index 0000000000..88930e00d2 --- /dev/null +++ b/src/joint_embedding/mask_dataset/test.R @@ -0,0 +1,55 @@ +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +par <- list( + input_mod1 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_rna.h5ad", + input_mod2 = "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.output_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad", + output_solution = "solution.h5ad" +) + +cat("> Running mask component\n") +out <- processx::run( + command = paste0("./", meta["functionality_name"]), + args = c( + 
"--input_mod1", par$input_mod1, + "--input_mod2", par$input_mod2, + "--output_mod1", par$output_mod1, + "--output_mod2", par$output_mod2, + "--output_solution", par$output_solution + ), + stderr_to_stdout = TRUE +) + +cat("> Checking whether output files were created\n") +expect_true(file.exists(par$output_mod1)) +expect_true(file.exists(par$output_mod2)) +expect_true(file.exists(par$output_solution)) + +cat("> Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +output_mod1 <- anndata::read_h5ad(par$output_mod1) +output_mod2 <- anndata::read_h5ad(par$output_mod2) +output_solution <- anndata::read_h5ad(par$output_solution) + +cat("> Checking contents of h5ad files\n") +expect_equal(output_mod1$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod2$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_solution$uns[["dataset_id"]], paste0(input_mod1$uns[["dataset_id"]], "_JE")) +expect_equal(output_mod1$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod2$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_solution$uns[["organism"]], input_mod1$uns[["organism"]]) +expect_equal(output_mod1$n_obs, input_mod1$n_obs) +expect_equal(output_mod2$n_obs, input_mod2$n_obs) +expect_equal(output_mod1$n_vars, input_mod1$n_vars) +expect_equal(output_mod2$n_vars, input_mod2$n_vars) +expect_equal(output_mod1$var_names, input_mod1$var_names) +expect_equal(output_mod2$var_names, input_mod2$var_names) +expect_equal(output_mod1$obs_names, input_mod1$obs_names) +expect_equal(output_mod2$obs_names, input_mod2$obs_names) + +# TODO check contents of matrices, check rownames + +cat("> Test succeeded!\n") diff --git a/src/joint_embedding/methods/lmds/config.vsh.yaml b/src/joint_embedding/methods/lmds/config.vsh.yaml new file mode 100644 index 0000000000..440172d748 --- /dev/null +++ b/src/joint_embedding/methods/lmds/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_method.yamllowmem +functionality: + name: lmds + namespace: joint_embedding/methods + description: Landmark MDS dimensionality reduction on the Spearman distance. + info: + type: method + method_name: "LMDS" + arguments: + - name: "--distance_method" + type: "string" + default: "spearman" + description: The distance method to use. Possible values are euclidean, pearson, spearman and others. + - name: "--n_dims" + type: integer + default: 10 + description: Number of dimensions to output. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/lmds/script.R b/src/joint_embedding/methods/lmds/script.R new file mode 100644 index 0000000000..7dda2a3c4c --- /dev/null +++ b/src/joint_embedding/methods/lmds/script.R @@ -0,0 +1,44 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(anndata, warn.conflicts = FALSE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "resources_test/common/joint_embedding/openproblems_bmmc_multiome_starter." +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output/lmds/output.h5ad", + n_dims = 10L, + distance_method = "spearman" +) +## VIASH END + +cat("Reading h5ad files\n") +ad1 <- anndata::read_h5ad(par$input_mod1) +ad2 <- anndata::read_h5ad(par$input_mod2) + +cat("Performing DR\n") +dr <- lmds::lmds( + cbind(ad1$X, ad2$X), + ndim = par$n_dims, + distance_method = par$distance_method +) + +rownames(dr) <- rownames(ad1) +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = ad1$uns[["dataset_id"]], + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/mnn/config.vsh.yaml b/src/joint_embedding/methods/mnn/config.vsh.yaml new file mode 100644 index 0000000000..30dc728fd9 --- /dev/null +++ b/src/joint_embedding/methods/mnn/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: mnn + namespace: joint_embedding/methods + description: Mutual nearest neighbors correction followed by PCA. + info: + type: method + method_name: "MNN" + arguments: + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, batchelor, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/mnn/script.R b/src/joint_embedding/methods/mnn/script.R new file mode 100644 index 0000000000..27a4454bff --- /dev/null +++ b/src/joint_embedding/methods/mnn/script.R @@ -0,0 +1,67 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("batchelor", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +Xt_mod1 <- t(input_mod1$X) + +# select hvg +if (!is.null(par$hvg_sel) && nrow(Xt_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::rowSds(Xt_mod1) + Xt_mod1 <- Xt_mod1[order(sd_mod1, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +rm(input_mod1) +gc() + +Xt_mod2 <- t(anndata::read_h5ad(par$input_mod2)$X) +if (!is.null(par$hvg_sel) && nrow(Xt_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::rowSds(Xt_mod2) + Xt_mod2 <- Xt_mod2[order(sd_mod2, decreasing = TRUE)[seq_len(par$hvg_sel)], ] +} + +cat("Running fastMNN\n") +mnn_out <- batchelor::fastMNN( + rbind(Xt_mod1, Xt_mod2), + batch = batch +) +dr <- SingleCellExperiment::reducedDim(mnn_out, "corrected") + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/newwave/config.vsh.yaml b/src/joint_embedding/methods/newwave/config.vsh.yaml new file mode 100644 index 0000000000..0939d7b6f7 --- /dev/null +++ b/src/joint_embedding/methods/newwave/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave + namespace: joint_embedding/methods + description: Concatenated NewWave. + info: + type: method + method_name: "NewWave" + doi: "10.1101/2021.08.02.453487" + arguments: + - name: "--maxiter" + type: "integer" + default: 100 + description: Maximum number of NewWave iterations. + - name: "--k" + type: "integer" + default: 10 + description: NewWave K parameter. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor] + - type: r + bioc: [ SingleCellExperiment, NewWave, proxyC ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/joint_embedding/methods/newwave/script.R b/src/joint_embedding/methods/newwave/script.R new file mode 100644 index 0000000000..f87d79cdd5 --- /dev/null +++ b/src/joint_embedding/methods/newwave/script.R @@ -0,0 +1,111 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + maxiter = 2L, + k = 3L, + hvg_sel = 1000 +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Reading mod1 h5ad\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] + +sd1 <- proxyC::colSds(input_mod1$X) +fil1 <- + if (!is.null(par$hvg_sel) && ncol(input_mod1) > par$hvg_sel) { + head(order(sd1, decreasing = TRUE), par$hvg_sel) + } else { + which(sd1 > 0) + } +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod1$layers[["counts"]][, fil1])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data1)), + n_cell_par = min(300, ncol(data1)), + commondispersion = FALSE +) +rm(data1) + +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading mod2 anndata\n") +input_mod2 <- anndata::read_h5ad(par$input_mod2) +sd2 <- proxyC::colSds(input_mod2$X) +fil2 <- + if (!is.null(par$hvg_sel) && ncol(input_mod2) > par$hvg_sel) { + head(order(sd2, decreasing = TRUE), par$hvg_sel) + } else { + which(sd2 > 0) + } +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = t(input_mod2$layers[["counts"]][, fil2])), + colData = data.frame(batch = factor(batch)) +) +rm(input_mod2) +gc() + +cat("Running NewWave on mod2\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = par$k, + maxiter_optimize = par$maxiter, + n_gene_par = min(300, nrow(data2)), + n_cell_par = min(300, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) +rm(data2) + +cat("Spline separate DRs\n") +dr <- do.call(cbind, lapply(seq_len(ncol(dr_x1)), function(i) { + cbind(dr_x1[, i], dr_x2[, i]) +})) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(ncol(dr))) + +out <- anndata::AnnData( + X = dr, 
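+  # Per the prediction spec, the embedding is also written below to
+  # obsm[["X_emb"]] as a sparse matrix.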
+ uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list(X_emb = as(dr, "CsparseMatrix")) + +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/pca/config.vsh.yaml b/src/joint_embedding/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..860a48c300 --- /dev/null +++ b/src/joint_embedding/methods/pca/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pca + namespace: joint_embedding/methods + description: PCA dimensionality reduction. + info: + type: method + method_name: "PCA" + authors: + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to output. + - name: "--hvg_sel" + type: "integer" + default: 1000 + description: Number of features per modality to use. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse, bioconductor, irlba, proxyC] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [anndata>=0.8] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/methods/pca/script.R b/src/joint_embedding/methods/pca/script.R new file mode 100644 index 0000000000..d16eb027cf --- /dev/null +++ b/src/joint_embedding/methods/pca/script.R @@ -0,0 +1,63 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." 
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 4L, + hvg_sel = 1000L +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing DR\n") +dr <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = par$n_dims +)$x + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( X_emb = as(dr, "CsparseMatrix")) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/methods/totalvi/config.vsh.yaml b/src/joint_embedding/methods/totalvi/config.vsh.yaml new file mode 100644 index 0000000000..5b0969172a --- /dev/null +++ b/src/joint_embedding/methods/totalvi/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: totalvi + namespace: joint_embedding/methods + version: dev + description: "totalVI: joint probabilistic modeling with Total Variational Inference" + info: + type: method + method_name: totalVI + doi: 10.1038/s41592-020-01050-x + arguments: + - name: --hvg_number + type: integer + default: 4000 + description: Number of HVG to include in totalVI + - name: --max_epochs + type: integer + default: 400 + description: Number of max epochs to run totalVI + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scikit-misc, scipy, scikit-learn, scvi-tools] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] + + + diff --git a/src/joint_embedding/methods/totalvi/script.py b/src/joint_embedding/methods/totalvi/script.py new file mode 100644 index 0000000000..9b40e1f3d1 --- /dev/null +++ b/src/joint_embedding/methods/totalvi/script.py @@ -0,0 +1,58 @@ +import anndata +import scanpy as sc +from scipy import sparse +from scvi.model import TOTALVI + +## VIASH START +par = { + "input_mod1": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod1.h5ad", + "input_mod2": "output/public_datasets/joint_embedding/totalvi_spleen_lymph_111/totalvi_spleen_lymph_111.censor_dataset.output_mod2.h5ad", + "output": "tmp/output_prediction.h5ad", + "hvg_number": 4000, + "max_epochs": 20 +} + +meta = { + 'funcionality_name': "foo" +} +## VIASH END + +print("Load and prepare data", flush=True) +adata_mod1 = anndata.read_h5ad(par['input_mod1']) +adata_mod2 = anndata.read_h5ad(par['input_mod2']) 
+adata_mod1.obsm['protein_expression'] = adata_mod2.layers["counts"].toarray()
+
+print('Select highly variable genes', flush=True)
+sc.pp.highly_variable_genes(
+    adata_mod1,
+    n_top_genes=par['hvg_number'],
+    flavor="seurat_v3",
+    batch_key="batch",
+    subset=True
+)
+
+print("Set up model", flush=True)
+TOTALVI.setup_anndata(
+    adata_mod1,
+    batch_key="batch",
+    protein_expression_obsm_key="protein_expression"
+)
+
+print('Train totalVI with', par['max_epochs'], 'epochs', flush=True)
+vae = TOTALVI(adata_mod1, latent_distribution="normal")
+vae.train(max_epochs=par['max_epochs'])
+
+print("Postprocessing and saving output", flush=True)
+adata_out = anndata.AnnData(
+    X=vae.get_latent_representation(),
+    obs=adata_mod1.obs[['batch']],
+    uns={
+        "dataset_id": adata_mod1.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    },
+    obsm={"X_emb": sparse.csr_matrix(vae.get_latent_representation())}
+)
+
+del adata_out.X
+
+adata_out.write_h5ad(par['output'], compression="gzip")
diff --git a/src/joint_embedding/methods/umap/config.vsh.yaml b/src/joint_embedding/methods/umap/config.vsh.yaml
new file mode 100644
index 0000000000..5ef2d3d11e
--- /dev/null
+++ b/src/joint_embedding/methods/umap/config.vsh.yaml
@@ -0,0 +1,46 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: umap
+  namespace: joint_embedding/methods
+  version: dev
+  description: UMAP dimensionality reduction on the Euclidean distance.
+  info:
+    type: method
+    method_name: UMAP
+  arguments:
+    - name: "--n_dims"
+      type: "integer"
+      default: 10
+      description: Number of dimensions to output.
+    - name: "--metric"
+      type: "string"
+      default: "euclidean"
+      description: The metric to use. Possible values are euclidean, cosine and manhattan.
+    - name: "--n_neighbors"
+      type: "integer"
+      default: 15
+      description: Number of neighbors to use in the KNN graph.
+    - name: "--n_pcs"
+      type: "integer"
+      default: 50
+      description: Number of principal components to use in the PCA step.
+    - name: "--hvg_sel"
+      type: "integer"
+      default: 1000
+      description: Number of features per modality to use.
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, lmds, tidyverse, irlba, proxyC, uwot ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8 ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
diff --git a/src/joint_embedding/methods/umap/script.R b/src/joint_embedding/methods/umap/script.R
new file mode 100644
index 0000000000..2f654be245
--- /dev/null
+++ b/src/joint_embedding/methods/umap/script.R
@@ -0,0 +1,80 @@
+cat("Loading dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+requireNamespace("anndata", quietly = TRUE)
+library(Matrix, warn.conflicts = FALSE, quietly = TRUE)
+
+## VIASH START
+# path <- "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter."
+path <- "output/datasets/joint_embedding/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +# path <- "output/public_datasets/joint_embedding/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +par <- list( + input_mod1 = paste0(path, "mod1.h5ad"), + input_mod2 = paste0(path, "mod2.h5ad"), + output = "output.h5ad", + n_dims = 10L, + n_neighbors = 15L, + metric = "euclidean", + n_pcs = 50L, + hvg_sel = 100L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading h5ad files\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) + +rn <- rownames(input_mod1) +batch <- input_mod1$obs$batch +dataset_id <- input_mod1$uns[["dataset_id"]] +X_mod1 <- input_mod1$X + +# select hvg +if (!is.null(par$hvg_sel) && ncol(X_mod1) > par$hvg_sel) { + sd_mod1 <- proxyC::colSds(X_mod1) + X_mod1 <- X_mod1[, head(order(sd_mod1, decreasing = TRUE), par$hvg_sel)] +} + +rm(input_mod1) +gc() + +X_mod2 <- anndata::read_h5ad(par$input_mod2)$X +if (!is.null(par$hvg_sel) && ncol(X_mod2) > par$hvg_sel) { + sd_mod2 <- proxyC::colSds(X_mod2) + X_mod2 <- X_mod2[, head(order(sd_mod2, decreasing = TRUE), par$hvg_sel)] +} + +cat("Performing PCA\n") +X_pca <- irlba::prcomp_irlba( + cbind(X_mod1, X_mod2), + n = 100 +)$x + +cat("Performing UMap\n") +dr <- uwot::umap( + X_pca, + n_components = par$n_dims, + n_neighbors = par$n_neighbors, + metric = par$metric, + n_threads = n_cores, + nn_method = "annoy" +) + +rownames(dr) <- rn +colnames(dr) <- paste0("comp_", seq_len(par$n_dims)) + +out <- anndata::AnnData( + X = dr, + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ), + obsm = list( + X_emb = as(dr, "CsparseMatrix") + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ari/config.vsh.yaml b/src/joint_embedding/metrics/ari/config.vsh.yaml new file mode 100644 index 0000000000..41ef532e5a --- /dev/null +++ b/src/joint_embedding/metrics/ari/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ari + namespace: joint_embedding/metrics + description: Adjusted rand index (ARI) + info: + metrics: + - metric_id: ari + metric_name: ari + metric_description: Adjusted rand index (ARI) + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib, scanpy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/ari/script.py b/src/joint_embedding/metrics/ari/script.py new file mode 100644 index 0000000000..dc7c195f66 --- /dev/null +++ b/src/joint_embedding/metrics/ari/script.py @@ -0,0 +1,58 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ari.had", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = 
adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.ari(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=["ari"], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/asw_batch/config.vsh.yaml b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..e89b30dc90 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_batch + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of batches per label + info: + metrics: + - metric_id: asw_batch + metric_name: asw_batch + metric_description: Average silhouette width (ASW) of batches per label + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [ anndata>=0.8, scanpy, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_batch/script.py b/src/joint_embedding/metrics/asw_batch/script.py new file mode 100644 index 0000000000..88e89c21a8 --- /dev/null +++ b/src/joint_embedding/metrics/asw_batch/script.py @@ -0,0 +1,55 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette_batch( + adata, + batch_key='batch', + group_key='cell_type', + embed='X_emb', + verbose=False +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_batch'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/asw_label/config.vsh.yaml b/src/joint_embedding/metrics/asw_label/config.vsh.yaml new file mode 100644 index 
0000000000..262b48d6f4 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: asw_label + namespace: joint_embedding/metrics + description: Average silhouette width (ASW) of labels + info: + metrics: + - metric_id: asw_label + metric_name: asw_label + metric_description: Average silhouette width (ASW) of labels + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/asw_label/script.py b/src/joint_embedding/metrics/asw_label/script.py new file mode 100644 index 0000000000..4d04092004 --- /dev/null +++ b/src/joint_embedding/metrics/asw_label/script.py @@ -0,0 +1,49 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.asw_batch.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Compute score') +score = scib.me.silhouette(adata, group_key='cell_type', embed='X_emb') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['asw_label'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + diff --git a/src/joint_embedding/metrics/cc_cons/config.vsh.yaml b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml new file mode 100644 index 0000000000..ba501a5e21 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: cc_cons + namespace: joint_embedding/metrics + description: Cell cycle conservation score + info: + metrics: + - metric_id: cc_cons + metric_name: cc_cons + metric_description: Cell cycle conservation score + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ vhighmem, midcpu ] diff --git a/src/joint_embedding/metrics/cc_cons/script.py b/src/joint_embedding/metrics/cc_cons/script.py new file mode 100644 index 0000000000..feecf45f12 --- /dev/null +++ b/src/joint_embedding/metrics/cc_cons/script.py @@ -0,0 +1,56 @@ +import pprint +import anndata as ad +import scib + +## VIASH START +par = 
dict( + input_prediction="resources_test/common/joint_embedding/cite_random_prediction.h5ad", + input_solution="resources_test/common/joint_embedding/cite_solution.h5ad", + output="resources_test/common/joint_embedding/score_cc_cons.h5ad", + debug=False +) +## VIASH END + + +if par['debug']: + pprint.pprint(par) + +print("Read prediction anndata") +adata = ad.read_h5ad(par['input_prediction']) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(par['input_solution']) +organism = adata_solution.uns['organism'] + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +recompute_cc = 'S_score' not in adata_solution.obs_keys() or \ + 'G2M_score' not in adata_solution.obs_keys() + +print('Compute score') +adata.obsm['X_emb'] = adata.obsm['X_emb'].toarray() + +score = scib.me.cell_cycle( + adata_pre=adata_solution, + adata_post=adata, + batch_key='batch', + embed='X_emb', + recompute_cc=recompute_cc, + organism=organism +) + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns= { + "dataset_id":adata.uns['dataset_id'], + "method_id":adata.uns['method_id'], + "metric_ids":['cc_cons'], + "metric_values":[score], + } +) + +print("Write output to h5ad file") +out.write(par['output'], compression='gzip') diff --git a/src/joint_embedding/metrics/check_format/config.vsh.yaml b/src/joint_embedding/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..90294f01e6 --- /dev/null +++ b/src/joint_embedding/metrics/check_format/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: check_format + namespace: joint_embedding/metrics + description: Checking whether the prediction of a method has the right format. 
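+ # Editorial note (assumption based on script.R below): the component emits two scores, 'finished' (1 whenever the component ran to completion) and 'correct_format' (1 if the prediction passed all format checks, 0 otherwise).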
+ info: + metrics: + - metric_id: finished + metric_name: finished + metric_description: Whether the metric component finished successfully + min: 0 + max: 1 + maximize: true + - metric_id: correct_format + metric_name: correct_format + metric_description: Whether the prediction has the correct format + min: 0 + max: 1 + maximize: true + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, tidyverse ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/check_format/script.R b/src/joint_embedding/metrics/check_format/script.R new file mode 100644 index 0000000000..afe8ed10fd --- /dev/null +++ b/src/joint_embedding/metrics/check_format/script.R @@ -0,0 +1,61 @@ +cat("Load dependencies\n") +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) + +## VIASH START +task <- "joint_embedding" +par <- list( + input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad"), + input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"), + output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad") +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- read_h5ad(par$input_solution) + +cat("Checking prediction h5ad\n") +correct_format <- tryCatch({ + # read prediction + ad_pred <- read_h5ad(par$input_prediction) + + # check dataset id + dataset_id <- ad_pred$uns[["dataset_id"]] + assert_that(dataset_id == ad_sol$uns[["dataset_id"]]) + + # check method id + method_id <- ad_pred$uns[["method_id"]] + assert_that( + is.character(method_id), + method_id != "" + ) + + # check X + assert_that( + ad_pred$n_obs == ad_sol$n_obs, + ad_pred$n_vars >= 1, + ad_pred$n_vars <= 100, + !is.null(ad_pred$obs_names), + all(ad_pred$obs_names == ad_sol$obs_names) + ) + + 1 +}, error = function(e) { + cat("ERROR: ", e$message, "\n", sep = "") + 0 +}) + + +cat("Create output object\n") +out <- AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = c("finished", "correct_format"), + metric_values = c(1, correct_format) + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..c79966893c --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: graph_connectivity + namespace: joint_embedding/metrics + description: Graph connectivity + info: + metrics: + - metric_id: graph_conn + metric_name: graph_conn + metric_description: Graph connectivity + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git 
a/src/joint_embedding/metrics/graph_connectivity/script.py b/src/joint_embedding/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ab9089d74a --- /dev/null +++ b/src/joint_embedding/metrics/graph_connectivity/script.py @@ -0,0 +1,53 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.graph_conn.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute score') +score = scib.me.graph_connectivity(adata, label_key='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns = { + 'dataset_id':adata.uns['dataset_id'], + 'method_id':adata.uns['method_id'], + 'metric_ids':['graph_conn'], + 'metric_values':[score] + } +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') diff --git a/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml new file mode 100644 index 0000000000..bf989bfc20 --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: latent_mixing + namespace: joint_embedding/metrics + description: Calculate latent mixing metric for joint embedding task. + info: + metrics: + - metric_id: latent_mixing + metric_name: Latent mixing + metric_description: Calculate latent mixing + min: -1 + max: 0 + maximize: True + arguments: + - name: "--n_neighbors" + type: integer + default: 100 + description: Number of neighbors for the entropy_batch_mixing metric. 
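+ # Editorial note (assumption based on script.py below): the score is the negative KL divergence between the batch composition of each sampled cell's k-nearest-neighbour pool and the global batch frequencies, averaged over random pools; the script assumes exactly two batches.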
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scikit-learn, scipy] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/latent_mixing/script.py b/src/joint_embedding/metrics/latent_mixing/script.py new file mode 100644 index 0000000000..511e7f8d2a --- /dev/null +++ b/src/joint_embedding/metrics/latent_mixing/script.py @@ -0,0 +1,92 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.neighbors import NearestNeighbors + +# VIASH START +par = { + "input_prediction": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "input_solution": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + "output": "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores_totalvi.h5ad", + "n_neighbors": 100 +} +# VIASH END + +print("Read input files") +predict_adata = ad.read_h5ad(par["input_prediction"]) +solution_adata = ad.read_h5ad(par["input_solution"]) + +print("Merge prediction with solution") +merged_adata = predict_adata.copy() + +batch_val = solution_adata.obs["batch"].astype(str) +batch_unique_values, batch_index = np.unique(batch_val, return_inverse=True) + +merged_adata.obs["batch"] = batch_index + +def entropy_batch_mixing( + latent_space, batches, n_neighbors=50, n_pools=50, n_samples_per_pool=100 +): + + def neg_kl(hist_data, global_freq): + n_batches = len(np.unique(hist_data)) + if n_batches > 2: + raise ValueError("Should be only two clusters for this metric") + frequency = np.mean(hist_data == 1) + if frequency == 0 or frequency == 1: + return 0 + return -( + frequency * np.log(frequency / global_freq) + + (1 - frequency) * np.log((1 - frequency) / (1 - global_freq)) + ) + + # cap the number of neighbours at the number of cells - 1 + n_neighbors = min(n_neighbors, latent_space.shape[0] - 1) + nne = NearestNeighbors(n_neighbors=1 + n_neighbors, n_jobs=8) + nne.fit(latent_space) + kmatrix = nne.kneighbors_graph(latent_space) - scipy.sparse.identity( + latent_space.shape[0] + ) + + global_freq = np.mean(batches) + print("Global batch frequency:", global_freq) + score = 0 + for t in range(n_pools): + indices = np.random.choice( + np.arange(latent_space.shape[0]), size=n_samples_per_pool + ) + score += np.mean( + [ + neg_kl( + batches[ # the batches of cell i's neighbors + kmatrix[indices].nonzero()[ + 1 + ][ # the neighbors of cell i (columns in row i) + kmatrix[indices].nonzero()[0] == i # the row of cell i + ] + ], + global_freq, + ) + for i in range(n_samples_per_pool) + ] + ) + return score / float(n_pools) + + +print("Calculate latent mixing metric") +latent_mixing = entropy_batch_mixing( + latent_space=merged_adata.obsm['X_emb'], + batches=merged_adata.obs["batch"].values, + n_neighbors=par["n_neighbors"] +) + +print("Write output") +adata_out = ad.AnnData( + uns = { + "dataset_id": predict_adata.uns["dataset_id"], + "method_id" : predict_adata.uns["method_id"], + "metric_ids" : ["latent_mixing"], + "metric_values" : [latent_mixing] + } +) + +adata_out.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/joint_embedding/metrics/nmi/config.vsh.yaml b/src/joint_embedding/metrics/nmi/config.vsh.yaml new file mode 100644 index 0000000000..e83772d65e --- /dev/null +++ 
b/src/joint_embedding/metrics/nmi/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: nmi + namespace: joint_embedding/metrics + description: Normalised mutual information (NMI) + info: + metrics: + - metric_id: nmi + metric_name: NMI + metric_description: Normalised mutual information (NMI) + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/joint_embedding/metrics/nmi/script.py b/src/joint_embedding/metrics/nmi/script.py new file mode 100644 index 0000000000..73ef901bd7 --- /dev/null +++ b/src/joint_embedding/metrics/nmi/script.py @@ -0,0 +1,64 @@ +import pprint +import scanpy as sc +import anndata as ad +import scib + + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.nmi.tsv", + debug=True +) + +## VIASH END + +if par['debug']: + pprint.pprint(par) + + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Clustering') +scib.cl.opt_louvain( + adata, + label_key='cell_type', + cluster_key='cluster', + plot=False, + inplace=True, + force=True +) + +print('Compute score') +score = scib.me.nmi(adata, group1='cluster', group2='cell_type') + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['nmi'], + metric_values=[score] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/config.vsh.yaml b/src/joint_embedding/metrics/rfoob/config.vsh.yaml new file mode 100644 index 0000000000..099219de47 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/config.vsh.yaml @@ -0,0 +1,50 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: rfoob + namespace: joint_embedding/metrics + description: Calculating basic metrics for the joint embedding task. 
+ info: + metrics: + - metric_id: rfoob_celltype_accuracy + metric_name: Rfoob Celltype Accuracy + metric_description: Cell type prediction accuracy (out-of-bag) + min: 0 + max: 1 + maximize: True + - metric_id: rfoob_pseudotimegex_rsq + metric_name: rfoob pseudotimegex rsq + metric_description: rfoob pseudotimegex rsq + min: -1 + max: 1 + maximize: true + - metric_id: rfoob_pseudotimeadt_rsq + metric_name: rfoob pseudotimeadt rsq + metric_description: rfoob pseudotimeadt rsq + min: -1 + max: 1 + maximize: True + - metric_id: rfoob_batch_error + metric_name: rfoob batch error + metric_description: rfoob batch error + min: 0 + max: 1 + maximize: True + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git ] + - type: python + pip: [ anndata>=0.8 ] + - type: r + cran: [ anndata, ranger, tidyverse, testthat] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/joint_embedding/metrics/rfoob/script.R b/src/joint_embedding/metrics/rfoob/script.R new file mode 100644 index 0000000000..e237192fe3 --- /dev/null +++ b/src/joint_embedding/metrics/rfoob/script.R @@ -0,0 +1,68 @@ +cat("Load dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(testthat, quietly = TRUE, warn.conflicts = FALSE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_solution = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + input_prediction = "resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + output = "scores.h5ad" +) +## VIASH END + +cat("Read solution h5ad\n") +ad_sol <- anndata::read_h5ad(par$input_solution) + +cat("Read prediction h5ad\n") +expect_true( + grepl("\\.h5ad$", par$input_prediction), + info = "Prediction file should be an h5ad file" +) +ad_pred <- + tryCatch({ + anndata::read_h5ad(par$input_prediction) + }, error = function(e) { + stop(paste0("Can't open prediction h5ad file. 
Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +cat("Calculating metrics\n") +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_CELL_TYPE = ad_sol$obs[["cell_type"]]) +rf1 <- ranger::ranger(SOLUTION_CELL_TYPE ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs$pseudotime_order_GEX) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf2 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +colname <- colnames(ad_sol$obs)[grepl("pseudotime_order_A.*", colnames(ad_sol$obs))] +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_PSEUDOTIME_ORDER = ad_sol$obs[[colname]]) +df <- df[is.finite(df$SOLUTION_PSEUDOTIME_ORDER), , drop = FALSE] +rf3 <- ranger::ranger(SOLUTION_PSEUDOTIME_ORDER ~ ., df) + +df <- data.frame(as.matrix(ad_pred$obsm[["X_emb"]]), SOLUTION_BATCH = ad_sol$obs$batch) +rf4 <- ranger::ranger(SOLUTION_BATCH ~ ., df) + +metric_values <- c( + rfoob_celltype_accuracy = 1 - rf1$prediction.error, + rfoob_pseudotimegex_rsq = rf2$r.squared, + rfoob_pseudotimeadt_rsq = rf3$r.squared, + rfoob_batch_error = rf4$prediction.error +) + +cat("Create output object\n") +out <- anndata::AnnData( + shape = c(0, 0), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(metric_values), + metric_values = metric_values + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/joint_embedding/metrics/ti_cons/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml new file mode 100644 index 0000000000..68525b337b --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons + namespace: joint_embedding/metrics + description: Trajectory inference conservation score + info: + metrics: + - metric_id: ti_cons_RNA + metric_name: ti cons RNA + metric_description: ti cons RNA + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_ADT_ATAC + metric_name: ti cons ADT ATAC + metric_description: ti cons ADT ATAC + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_mean + metric_name: ti cons mean + metric_description: ti cons mean + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons/script.py b/src/joint_embedding/metrics/ti_cons/script.py new file mode 100644 index 0000000000..1d04067024 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons/script.py @@ -0,0 +1,81 @@ + + +print('Importing libraries') +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons' + 
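+# Editorial note (assumption based on the calls below): the scores use scib's trajectory conservation, comparing the pseudotime ordering in the solution with one recomputed on the method's embedding, once for the RNA trajectory and once for the ADT/ATAC trajectory, and their mean is reported.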
+input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_RNA', 'ti_cons_ADT_ATAC', 'ti_cons_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') \ No newline at end of file diff --git a/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml new file mode 100644 index 0000000000..7139b8aadc --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: ti_cons_batch + namespace: joint_embedding/metrics + description: Trajectory inference conservation score per batch + info: + metrics: + - metric_id: ti_cons_batch_RNA + metric_name: ti cons batch RNA + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_ADT_ATAC + metric_name: ti cons batch ADT ATAC + metric_description: placeholder + min: 0 + max: 1 + maximize: True + - metric_id: ti_cons_batch_mean + metric_name: ti cons batch mean + metric_description: placeholder + min: 0 + max: 1 + maximize: True + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.10 + setup: + - type: python + pip: [anndata>=0.8, scib] + test_setup: + - type: python + pip: [ pyyaml ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/joint_embedding/metrics/ti_cons_batch/script.py b/src/joint_embedding/metrics/ti_cons_batch/script.py new file mode 100644 index 0000000000..5206b59610 --- /dev/null +++ b/src/joint_embedding/metrics/ti_cons_batch/script.py @@ -0,0 +1,87 @@ +import pprint +import numpy as np +import scanpy as sc +import anndata as ad +import scib + +## VIASH START +par = dict( + input_prediction="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + input_solution="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.solution.h5ad", + 
output="resources_test/joint_embedding/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.ti_cons.h5ad", + debug=True +) +## VIASH END + +if par['debug']: + pprint.pprint(par) + +OUTPUT_TYPE = 'graph' +METRIC = 'ti_cons_batch' + +input_prediction = par['input_prediction'] +input_solution = par['input_solution'] +output = par['output'] + +print("Read prediction anndata") +adata = ad.read_h5ad(input_prediction) +dataset_id = adata.uns['dataset_id'] + +print("Read solution anndata") +adata_solution = ad.read_h5ad(input_solution) + +print('Transfer obs annotations') +adata.obs['batch'] = adata_solution.obs['batch'][adata.obs_names] +adata.obs['cell_type'] = adata_solution.obs['cell_type'][adata.obs_names] +adt_atac_trajectory = 'pseudotime_order_ATAC' if 'pseudotime_order_ATAC' in adata_solution.obs else 'pseudotime_order_ADT' + +print('Preprocessing') +sc.pp.neighbors(adata, use_rep='X_emb') + +print('Compute scores') +obs_keys = adata_solution.obs_keys() + +if 'pseudotime_order_GEX' in obs_keys: + score_rna = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key='pseudotime_order_GEX' + ) +else: + score_rna = np.nan + +if adt_atac_trajectory in obs_keys: + score_adt_atac = scib.me.trajectory_conservation( + adata_pre=adata_solution, + adata_post=adata, + label_key='cell_type', + batch_key='batch', + pseudotime_key=adt_atac_trajectory + ) +else: + score_adt_atac = np.nan + +score_mean = (score_rna + score_adt_atac) / 2 + +# store adata with metrics +print("Create output object") +out = ad.AnnData( + uns=dict( + dataset_id=adata.uns['dataset_id'], + method_id=adata.uns['method_id'], + metric_ids=['ti_cons_batch_RNA', 'ti_cons_batch_ADT_ATAC', 'ti_cons_batch_mean'], + metric_values=[score_rna, score_adt_atac, score_mean] + ) +) + +print("Write output to h5ad file") +out.write(output, compression='gzip') + +# # store score as tsv +# with open(output, 'w') as file: +# header = ['dataset', 'output_type', 'metric', 'value'] +# entry = [dataset_id, OUTPUT_TYPE, METRIC, score] +# file.write('\t'.join(header) + '\n') +# file.write('\t'.join([str(x) for x in entry])) diff --git a/src/joint_embedding/resources_scripts/mask_datasets.sh b/src/joint_embedding/resources_scripts/mask_datasets.sh new file mode 100644 index 0000000000..dfb1295fb7 --- /dev/null +++ b/src/joint_embedding/resources_scripts/mask_datasets.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +COMMON_DATASETS="resources/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/datasets/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! 
-f $params_file ]; then + python << HERE +import anndata as ad +import glob +import yaml + +h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad") + +# this task doesn't use normalizations +param_list = {} + +for h5ad_file in h5ad_files: + print(f"Checking {h5ad_file}") + adata = ad.read_h5ad(h5ad_file, backed=True) + if "counts" in adata.layers: + dataset_id = adata.uns["dataset_id"].replace("/", ".") + obj = { + 'id': dataset_id, + 'input': h5ad_file, + 'dataset_id': dataset_id, + } + param_list[dataset_id] = obj + +output = { + "param_list": list(param_list.values()), + "seed": 123, + "output_train": "\$id.train.h5ad", + "output_test": "\$id.test.h5ad" +} + +with open("$params_file", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/joint_embedding/mask_dataset/main.nf \ + -profile docker \ + -resume \ + -params-file $params_file \ + --publish_dir "$OUTPUT_DIR" + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_scripts/run_benchmarks.sh b/src/joint_embedding/resources_scripts/run_benchmarks.sh new file mode 100644 index 0000000000..01d6ef92a5 --- /dev/null +++ b/src/joint_embedding/resources_scripts/run_benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources/joint_embedding/datasets/openproblems_v1" +OUTPUT_DIR="resources/joint_embedding/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="$OUTPUT_DIR/params.yaml" + +if [ ! -f $params_file ]; then + python << HERE +import yaml +import os + +dataset_dir = "$DATASETS_DIR" +output_dir = "$OUTPUT_DIR" + +# read split datasets yaml +with open(dataset_dir + "/params.yaml", "r") as file: + split_list = yaml.safe_load(file) +datasets = split_list['param_list'] + +# figure out where train/test files were stored +param_list = [] + +for dataset in datasets: + id = dataset["id"] + input_train = dataset_dir + "/" + id + ".train.h5ad" + input_test = dataset_dir + "/" + id + ".test.h5ad" + + if os.path.exists(input_test): + obj = { + 'id': id, + 'dataset_id': dataset["dataset_id"], + 'input_train': input_train, + 'input_test': input_test + } + param_list.append(obj) + +# write as output file +output = { + "param_list": param_list, +} + +with open(output_dir + "/params.yaml", "w") as file: + yaml.dump(output, file) +HERE +fi + +export NXF_VER=22.04.5 +nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" \ + -with-tower + +bin/tools/docker/nextflow/process_log/process_log \ + --output "$OUTPUT_DIR/nextflow_log.tsv" \ No newline at end of file diff --git a/src/joint_embedding/resources_test_scripts/bmmc_cite.sh b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh new file mode 100644 index 0000000000..72967ef27b --- /dev/null +++ b/src/joint_embedding/resources_test_scripts/bmmc_cite.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +#make sure the following command has been executed +#bin/viash_build -q 'joint_embedding|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad +MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad +DATASET_DIR=resources_test/joint_embedding/bmmc_cite + +if [ ! -f $MOD_1_DATA ]; then + echo "Error! Could not find raw data" + exit 1 +fi + +mkdir -p $DATASET_DIR + +# mask dataset +bin/viash run src/joint_embedding/mask_dataset/config.vsh.yaml -- \ + --input_mod1 $MOD_1_DATA \ + --input_mod2 $MOD_2_DATA \ + --output_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --output_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output_solution $DATASET_DIR/cite_solution.h5ad + +# run one method +bin/viash run src/joint_embedding/methods/pca/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --output $DATASET_DIR/pca.h5ad + +# run one metric +bin/viash run src/joint_embedding/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pca.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output $DATASET_DIR/ari.h5ad + +# run benchmark +export NXF_VER=22.04.5 + +bin/nextflow \ + run . 
\ + -main-script src/joint_embedding/workflows/run/main.nf \ + -profile docker \ + -resume \ + --id bmmc_cite \ + --dataset_id bmmc_cite \ + --input_mod1 $DATASET_DIR/cite_mod1.h5ad \ + --input_mod2 $DATASET_DIR/cite_mod2.h5ad \ + --input_solution $DATASET_DIR/cite_solution.h5ad \ + --output scores.tsv \ + --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/config.vsh.yaml b/src/joint_embedding/workflows/run/config.vsh.yaml new file mode 100644 index 0000000000..5f3e7800cf --- /dev/null +++ b/src/joint_embedding/workflows/run/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: "run_benchmark" + namespace: "joint_embedding/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: "file" # todo: replace with includes + - name: "--input_mod2" + type: "file" # todo: replace with includes + - name: "--input_solution" + type: "file" # todo: replace with includes + - name: Outputs + arguments: + - name: "--output" + direction: "output" + type: file + resources: + - type: nextflow_script + path: main.nf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/main.nf b/src/joint_embedding/workflows/run/main.nf new file mode 100644 index 0000000000..45ac7ad76d --- /dev/null +++ b/src/joint_embedding/workflows/run/main.nf @@ -0,0 +1,152 @@ +nextflow.enable.dsl=2 + +sourceDir = params.rootDir + "/src" +targetDir = params.rootDir + "/target/nextflow" + +// import control methods +include { random_embed } from "$targetDir/joint_embedding/control_methods/random_embed/main.nf" +include { zeros_embed } from "$targetDir/joint_embedding/control_methods/zeros_embed/main.nf" + +// import methods
include { lmds } from "$targetDir/joint_embedding/methods/lmds/main.nf" +include { mnn } from "$targetDir/joint_embedding/methods/mnn/main.nf" +include { newwave } from "$targetDir/joint_embedding/methods/newwave/main.nf" +include { pca } from "$targetDir/joint_embedding/methods/pca/main.nf" +include { totalvi } from "$targetDir/joint_embedding/methods/totalvi/main.nf" +include { umap } from "$targetDir/joint_embedding/methods/umap/main.nf" + +// import metrics +include { ari } from "$targetDir/joint_embedding/metrics/ari/main.nf" +include { asw_batch } from "$targetDir/joint_embedding/metrics/asw_batch/main.nf" +include { asw_label } from "$targetDir/joint_embedding/metrics/asw_label/main.nf" +include { cc_cons } from "$targetDir/joint_embedding/metrics/cc_cons/main.nf" +include { check_format } from "$targetDir/joint_embedding/metrics/check_format/main.nf" +include { graph_connectivity } from "$targetDir/joint_embedding/metrics/graph_connectivity/main.nf" +include { latent_mixing } from "$targetDir/joint_embedding/metrics/latent_mixing/main.nf" +include { nmi } from "$targetDir/joint_embedding/metrics/nmi/main.nf" +include { rfoob } from "$targetDir/joint_embedding/metrics/rfoob/main.nf" +include { ti_cons } from "$targetDir/joint_embedding/metrics/ti_cons/main.nf" +include { ti_cons_batch } from "$targetDir/joint_embedding/metrics/ti_cons_batch/main.nf" + +// tsv generation component +include { extract_scores } from "$targetDir/common/extract_scores/main.nf" + +// import helper functions +include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf" +include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + 
"/wf_utils/DataflowHelper.nf" + +config = readConfig("$projectDir/config.vsh.yaml") + +// construct a map of methods (id -> method_module) +methods = [ lmds, mnn, newwave, pca, totalvi, umap] + .collectEntries{method -> + [method.config.functionality.name, method] + } + +workflow { + helpMessage(config) + + viashChannel(params, config) + | run_wf +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // split params for downstream components + | setWorkflowArguments( + method: ["input_mod1", "input_mod2"], + metric: ["input_solution"], + output: ["output"] + ) + + // multiply events by the number of method + | add_methods + + // run methods + | getWorkflowArguments(key: "method") + | run_methods + + // construct tuples for metrics + | pmap{ id, file, passthrough -> + // derive unique ids from output filenames + def newId = file.getName().replaceAll(".output.*", "") + // combine prediction with solution + def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ] + [ newId, newData, passthrough ] + } + + // run metrics + | getWorkflowArguments(key: "metric") + | run_metrics + + // convert to tsv + | aggregate_results + + emit: + output_ch +} + +workflow add_methods { + take: input_ch + main: + output_ch = Channel.fromList(methods.keySet()) + | combine(input_ch) + + // generate combined id for method_id and dataset_id + | pmap{method_id, dataset_id, data -> + def new_id = dataset_id + "." + method_id + def new_data = data.clone() + [method_id: method_id] + new_data.remove("id") + [new_id, new_data] + } + emit: output_ch +} + +workflow run_methods { + take: input_ch + main: + // generate one channel per method + method_chs = methods.collect { method_id, method_module -> + input_ch + | filter{it[1].method_id == method_id} + | method_module + } + // mix all results + output_ch = method_chs[0].mix(*method_chs.drop(1)) + + emit: output_ch +} + +workflow run_metrics { + take: input_ch + main: + + output_ch = input_ch + | (ari & asw_batch & asw_label & cc_cons & check_format & graph_connectivity & latent_mixing & nmi & rfoob & ti_cons & ti_cons_batch) + | mix + + emit: output_ch +} + +workflow aggregate_results { + take: input_ch + main: + + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ it -> + [ "combined", it.collect{ it[1] } ] + it[0].drop(2) + } + | getWorkflowArguments(key: "output") + | extract_scores.run( + auto: [ publish: true ] + ) + + emit: output_ch +} \ No newline at end of file diff --git a/src/joint_embedding/workflows/run/nextflow.config b/src/joint_embedding/workflows/run/nextflow.config new file mode 100644 index 0000000000..ea674ffa07 --- /dev/null +++ b/src/joint_embedding/workflows/run/nextflow.config @@ -0,0 +1,14 @@ +manifest { + name = 'joint_embedding/workflows/run' + mainScript = 'main.nf' + nextflowVersion = '!>=22.04.5' + description = 'Multi modality - joint embedding' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config") +includeConfig("${params.rootDir}/src/wf_utils/labels.config") \ No newline at end of file diff --git a/src/match_modality/README.md b/src/match_modality/README.md new file mode 100644 index 0000000000..3f1ff48cc1 --- /dev/null +++ b/src/match_modality/README.md @@ -0,0 +1,23 @@ +# Match modality + +Structure of this task: + + src/match_modality + ├── api Interface specifications for components and 
datasets in this task + ├── control_methods Baseline (random/ground truth) methods to compare methods against + ├── methods Methods to be benchmarked + ├── metrics Metrics used to quantify performance of methods + ├── README.md This file + ├── resources_scripts Scripts to process the datasets + ├── resources_test_scripts Scripts to process the test resources + ├── split_dataset Component to prepare common datasets + └── workflows Pipelines to run the full benchmark + +Relevant links: + +* [Description and results at openproblems.bio](https://openproblems.bio/neurips_2021/) + +* [Experimental results](https://openproblems-experimental.netlify.app/results/match_modality/) + + +* [Contribution guide](https://github.com/openproblems-bio/openproblems-v2/blob/main/CONTRIBUTING.md) diff --git a/src/match_modality/api/anndata_dataset.yaml b/src/match_modality/api/anndata_dataset.yaml new file mode 100644 index 0000000000..3382dba2ff --- /dev/null +++ b/src/match_modality/api/anndata_dataset.yaml @@ -0,0 +1,52 @@ +type: file +description: An input h5ad dataset. +example: "dataset.h5ad" +info: + label: "Dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + - type: boolean + name: is_train + description: Whether the sample is part of the training data + required: true + var: + - type: string + name: gene_ids + description: Feature identifiers, if available + required: false + - type: string + name: feature_types + description: The modality of each feature (e.g. GEX, ATAC or ADT) + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: "The organism the data was derived from" + required: false + - type: string + name: gene_activity_var_names + description: Names of the variables of the gene activity matrix + required: true + obsm: + - type: double + name: gene_activity + description: Gene activity scores (e.g. derived from the ATAC data) + required: true + diff --git a/src/match_modality/api/anndata_masked_mod.yaml b/src/match_modality/api/anndata_masked_mod.yaml new file mode 100644 index 0000000000..31ef25d06e --- /dev/null +++ b/src/match_modality/api/anndata_masked_mod.yaml @@ -0,0 +1,46 @@ +type: file +description: "The masked data" +example: "masked.h5ad" +info: + short_description: "masked data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + var: + - type: string + name: feature_types + description: The modality of each feature (e.g. GEX, ATAC or ADT) + required: true + - type: string + name: gene_ids + description: Feature identifiers, if available + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: organism + description: The organism the data was derived from + required: true + - type: string + name: gene_activity_var_names + description: Names of the variables of the gene activity matrix + required: true + obsm: + - type: double + name: gene_activity + description: Gene activity scores (e.g. derived from the ATAC data) + required: true \ No newline at end of file diff --git a/src/match_modality/api/anndata_prediction.yaml b/src/match_modality/api/anndata_prediction.yaml new file mode 100644 index 0000000000..ca214d4963 --- /dev/null +++ b/src/match_modality/api/anndata_prediction.yaml @@ -0,0 +1,15 @@ +type: file +description: "The predicted pairing of test mod1 & mod2 profiles." 
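+# Editorial note (based on the generic tests in comp_method.yaml and comp_control_method.yaml): the pairing is stored in .X as a sparse matrix of pairing probabilities between mod1 and mod2 profiles, with non-negative values and rows that sum to 1.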
+example: "prediction.h5ad" +info: + short_description: "Prediction" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/match_modality/api/anndata_score.yaml b/src/match_modality/api/anndata_score.yaml new file mode 100644 index 0000000000..bfe79f07cc --- /dev/null +++ b/src/match_modality/api/anndata_score.yaml @@ -0,0 +1,25 @@ +type: file +description: "Metric score file" +example: "output.h5ad" +info: + short_description: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/match_modality/api/anndata_solution.yaml b/src/match_modality/api/anndata_solution.yaml new file mode 100644 index 0000000000..e486646e2f --- /dev/null +++ b/src/match_modality/api/anndata_solution.yaml @@ -0,0 +1,20 @@ +type: file +description: "The masked solution data" +example: "masked.h5ad" +info: + short_description: "masked solution data" + slots: + obs: + - type: string + name: batch + description: Batch information + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: double + name: pairing_ix + description: number of samples + required: true diff --git a/src/match_modality/api/authors.yaml b/src/match_modality/api/authors.yaml new file mode 100644 index 0000000000..bd2cade4d0 --- /dev/null +++ b/src/match_modality/api/authors.yaml @@ -0,0 +1,28 @@ +functionality: + authors: + - name: Robrecht Cannoodt + roles: [ author, contributor ] + props: { github: rcannood, orcid: "0000-0003-3641-729X" } + - name: Kai Waldrant + roles: [ contributor ] + props: { github: KaiWaldrant } + - name: Michaela Mueller + email: mumichae@in.tum.de + roles: [ author, maintainer ] + props: { github: mumichae, orcid: "0000-0002-1401-1785" } + - name: Louise Deconinck + email: louise.deconinck@gmail.com + roles: [ author, maintainer ] + props: { github: LouiseDck, orcid: "" } + - name: Alejandro Granados + email: alejandro.granados@czbiohub.org + roles: [ author, maintainer ] + props: { github: agranado } + - name: Alex Tong + email: alexandertongdev@gmail.com + roles: [ author, maintainer ] + props: { github: atong01 } + - name: Daniel Burkhardt + email: daniel.b.burkhardt@gmail.com + roles: [ author, maintainer ] + props: { github: dburkhardt } \ No newline at end of file diff --git a/src/match_modality/api/comp_control_method.yaml b/src/match_modality/api/comp_control_method.yaml new file mode 100644 index 0000000000..8dff03a367 --- /dev/null +++ b/src/match_modality/api/comp_control_method.yaml @@ -0,0 +1,68 @@ +functionality: + arguments: + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_sol" + __merge__: anndata_solution.yaml + - name: "--output" + direction: "output" + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + 
- type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--input_test_sol', testpar['input_test_sol'], + '--output', testpar['output'] + ], + check=True, + capture_output=True, + text=True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and uncomment the print below + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all(ad_pred.X.data >= 0), 'Values must be non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' + + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/api/comp_mask_dataset.yaml b/src/match_modality/api/comp_mask_dataset.yaml new file mode 100644 index 0000000000..3876ef397e --- /dev/null +++ b/src/match_modality/api/comp_mask_dataset.yaml @@ -0,0 +1,32 @@ +functionality: + arguments: + - name: "--input_mod1" + __merge__: anndata_dataset.yaml + - name: "--input_mod2" + __merge__: anndata_dataset.yaml + - name: "--output_train_mod1" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_train_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_train_sol" + direction: "output" + __merge__: anndata_solution.yaml + - name: "--output_test_mod1" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_mod2" + direction: "output" + __merge__: anndata_masked_mod.yaml + - name: "--output_test_sol" + direction: "output" + __merge__: anndata_solution.yaml + - name: "--knn" + type: "integer" + default: 20 + description: The KNN cutoff for computing nearest neighbors on the mod1 and mod2 profiles. + - name: "--seed" + type: "integer" + default: 1 + description: The random seed. 
\ No newline at end of file diff --git a/src/match_modality/api/comp_method.yaml b/src/match_modality/api/comp_method.yaml new file mode 100644 index 0000000000..87e45f646b --- /dev/null +++ b/src/match_modality/api/comp_method.yaml @@ -0,0 +1,77 @@ +functionality: + arguments: + - name: "--input_train_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--input_train_sol" + __merge__: anndata_solution.yaml + - name: "--input_test_mod1" + __merge__: anndata_masked_mod.yaml + - name: "--input_test_mod2" + __merge__: anndata_masked_mod.yaml + - name: "--output" + direction: output + __merge__: anndata_prediction.yaml + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import numpy as np + from scipy.sparse import issparse + + # define some filenames + testpar = { + 'input_train_mod1': 'output/output_train_mod1.h5ad', + 'input_train_mod2': 'output/output_train_mod2.h5ad', + 'input_train_sol': 'output/output_train_sol.h5ad', + 'input_test_mod1': 'output/output_test_mod1.h5ad', + 'input_test_mod2': 'output/output_test_mod2.h5ad', + 'input_test_sol': 'output/output_test_sol.h5ad', + 'output': 'output/output_prediction.h5ad', + } + + print('> Running method', flush=True) + out = subprocess.run([ + meta['executable'], + '--input_train_mod1', testpar['input_train_mod1'], + '--input_train_mod2', testpar['input_train_mod2'], + '--input_train_sol', testpar['input_train_sol'], + '--input_test_mod1', testpar['input_test_mod1'], + '--input_test_mod2', testpar['input_test_mod2'], + '--output', testpar['output'] + ], + check=True, + capture_output=True, + text=True + ) + + # for troubleshooting: remove 'check=True' in subprocess.run above and uncomment the print below + # print(out.stderr, flush=True) + + print('> Checking whether output files were created', flush=True) + assert path.exists(testpar['output']) + + print('> Reading h5ad files', flush=True) + ad_sol = ad.read_h5ad(testpar['input_test_sol']) + ad_pred = ad.read_h5ad(testpar['output']) + + print('> Checking dataset id', flush=True) + assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] + + print('> Checking method id', flush=True) + assert ad_pred.uns['method_id'] == meta['functionality_name'] + + print('> Checking X', flush=True) + assert issparse(ad_pred.X) + assert np.all(ad_pred.X.data >= 0), 'Values must be non-negative.' + assert ad_pred.X.nonzero()[0].size <= 1000 * ad_sol.n_obs + assert ad_pred.n_obs == ad_sol.n_obs + assert ad_pred.n_vars == ad_sol.n_vars + assert np.isclose(ad_pred.X.sum(axis=1), 1, atol=1e-10).all(), 'All rows should sum to 1.' 
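+ # note: these X checks mirror the ones in comp_control_method.yaml's generic test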
+ + print('> Test succeeded!', flush=True) diff --git a/src/match_modality/api/comp_metric.yaml b/src/match_modality/api/comp_metric.yaml new file mode 100644 index 0000000000..ae94586882 --- /dev/null +++ b/src/match_modality/api/comp_metric.yaml @@ -0,0 +1,83 @@ +functionality: + arguments: + - name: --input_prediction + __merge__: anndata_prediction.yaml + - name: --input_solution + __merge__: anndata_solution.yaml + - name: --output + __merge__: anndata_score.yaml + direction: output + test_resources: + - path: ../../../../output + - type: python_script + path: generic_test.py + text: | + from os import path + import subprocess + import anndata as ad + import pandas as pd + import yaml + + # define some filenames + input_prediction_path = "output/dr_knnr_cbf_prediction.h5ad" + input_solution_path = "output/output_test_sol.h5ad" + output_path = "output.h5ad" + + # read the component config + with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + + cmd = [ + meta['executable'], + "--input_prediction", input_prediction_path, + "--input_solution", input_solution_path, + "--output", output_path + ] + + print("> Running method", flush=True) + out = subprocess.run(cmd, check=True) + + print("> Checking whether output files were created", flush=True) + assert path.exists(output_path) + + print("> Reading h5ad files", flush=True) + input_prediction = ad.read_h5ad(input_prediction_path) + input_solution = ad.read_h5ad(input_solution_path) + output = ad.read_h5ad(output_path) + + # Create DF from metric config info + metric_info = config['functionality']['info']['metrics'] + metric_meta = pd.DataFrame(metric_info) + metric_meta = metric_meta.astype({'metric_id': str, 'metric_name': str, 'metric_description': str, 'min': float, 'max': float, 'maximize': bool}) + print("> Checking contents of metric info", flush=True) + assert 'metric_id' in metric_meta + assert 'min' in metric_meta + assert 'max' in metric_meta + assert 'maximize' in metric_meta + + print("> Checking .uns['dataset_id']", flush=True) + assert 'dataset_id' in output.uns + assert output.uns['dataset_id'] == input_prediction.uns['dataset_id'] + + print("> Checking .uns['method_id']", flush=True) + assert 'method_id' in output.uns + assert output.uns['method_id'] == input_prediction.uns['method_id'] + + print("> Checking .uns['metric_ids']", flush=True) + assert 'metric_ids' in output.uns + assert set(output.uns['metric_ids']) == set(metric_meta.metric_id) + + print("> Checking .uns['metric_values']", flush=True) + assert 'metric_values' in output.uns + assert output.uns['metric_ids'].size == output.uns['metric_values'].size + + # merge with metric_meta to see if metric_value lies within the expected range + output_uns = pd.DataFrame({ + 'metric_id': output.uns['metric_ids'], + 'value': output.uns['metric_values'] + }) + + scores = metric_meta.merge(output_uns, on="metric_id") + + assert all(scores.value >= scores['min']) + assert all(scores.value <= scores['max']) + + print("> Test succeeded!", flush=True) diff --git a/src/match_modality/api/task_info.yaml b/src/match_modality/api/task_info.yaml new file mode 100644 index 0000000000..50d32b98f9 --- /dev/null +++ b/src/match_modality/api/task_info.yaml @@ -0,0 +1,13 @@ +task_id: match_modality +task_name: Match modality +summary: "Predicting which profiles from one modality resemble profiles from another." +description: | + "While joint profiling of two modalities in the same single cell is now possible, + most single-cell datasets that exist measure only a single modality. 
These modalities + complement each other in their description of cellular state. Yet, it is challenging + to analyse uni-modal datasets together when they do not share observations (cells) or + a common feature space (genes, proteins, or open chromatin peaks). If we could map + observations to one another across modalities, it would be possible to treat separately + profiled datasets in the same manner as new multi-modal sequencing data. Mapping these + modalities to one another opens up the vast amount of uni-modal single-cell datasets + generated in the past years to multi-modal data analysis methods." \ No newline at end of file diff --git a/src/match_modality/control_methods/constant/config.vsh.yaml b/src/match_modality/control_methods/constant/config.vsh.yaml new file mode 100644 index 0000000000..c20acf234f --- /dev/null +++ b/src/match_modality/control_methods/constant/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: constant + namespace: match_modality/control_methods + description: Returns constant weights between all mod1 profiles and the first 1000 mod2 profiles. + info: + type: negative_control + method_name: Constant + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/constant/script.R b/src/match_modality/control_methods/constant/script.R new file mode 100644 index 0000000000..590d49f982 --- /dev/null +++ b/src/match_modality/control_methods/constant/script.R @@ -0,0 +1,50 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +par <- list( + input_test_mod1 = "output/output_test_mod1.h5ad", + input_test_mod2 = "output/output_test_mod2.h5ad", + output = "output/output_prediction.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + + +cat("Reading h5ad files\n") +# input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +# input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +# input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1, backed = TRUE) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2, backed = TRUE) + +knn_df <- + expand.grid( + i = seq_len(nrow(input_test_mod1)), + j = seq_len(min(nrow(input_test_mod2), 1000)) + ) + +knn_mat <- + Matrix::sparseMatrix( + i = knn_df$i, + j = knn_df$j, + x = rep(1, nrow(knn_df)), + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) + ) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_test_mod1$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/control_methods/random_pairing/config.vsh.yaml b/src/match_modality/control_methods/random_pairing/config.vsh.yaml new file mode 100644 index 0000000000..fd9da8d595 --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/config.vsh.yaml @@ 
-0,0 +1,20 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: random_pairing + namespace: match_modality/control_methods + description: Generates random pairings with weights drawn from a uniform distribution. + info: + type: negative_control + method_name: Random Pairing + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, numpy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/match_modality/control_methods/random_pairing/script.py b/src/match_modality/control_methods/random_pairing/script.py new file mode 100644 index 0000000000..868b6ef31c --- /dev/null +++ b/src/match_modality/control_methods/random_pairing/script.py @@ -0,0 +1,45 @@ +import anndata as ad +import numpy as np +import scipy.sparse +from sklearn.preprocessing import normalize + +# VIASH START +par = { + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", +} + +meta = { + "functionality_name": "foo" +} +# VIASH END + +print("Load datasets") +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# sample up to 1000 random entries per mod1 profile (this assumes mod1 and +# mod2 contain the same number of cells) +num_values = min(1000, input_test_mod1.n_obs) * input_test_mod1.n_obs +indices = np.random.randint(input_test_mod1.n_obs**2, size=num_values) + +mat_x = np.random.rand(num_values) +mat_i = indices % input_test_mod1.n_obs +mat_j = indices // input_test_mod1.n_obs +pairing_matrix = scipy.sparse.csr_matrix( + (mat_x, (mat_i, mat_j)), + shape=(input_test_mod1.n_obs, input_test_mod2.n_obs) +) + +# row normalise +prob_matrix = normalize(pairing_matrix, norm="l1") + +# Write out prediction +prediction = ad.AnnData( + X=prob_matrix, + uns={ + "method_id": meta["functionality_name"], + "dataset_id": input_test_mod1.uns["dataset_id"] + } +) +prediction.write_h5ad(par["output"]) diff --git a/src/match_modality/control_methods/semi_solution/config.vsh.yaml b/src/match_modality/control_methods/semi_solution/config.vsh.yaml new file mode 100644 index 0000000000..756716feb6 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: semi_solution + namespace: match_modality/control_methods + description: Returns a randomly perturbed version of the ground-truth pairing. 
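+ # Editor's note: despite the positive_control type, this is not the exact + # solution; script.R below perturbs the ground-truth matrix with uniform + # noise before row-normalising, so its scores should land between the random + # and perfect baselines.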
+ info: + type: positive_control + method_name: Semi-solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/semi_solution/script.R b/src/match_modality/control_methods/semi_solution/script.R new file mode 100644 index 0000000000..d36833b4b0 --- /dev/null +++ b/src/match_modality/control_methods/semi_solution/script.R @@ -0,0 +1,29 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +# randomly fill in gold standard values +input_test_sol$X@x <- runif(length(input_test_sol$X@x)) + +# fill other values with random values as well +ix <- sample.int(nrow(input_test_sol) * ncol(input_test_sol), nrow(input_test_sol) * 10) +input_test_sol$X[ix] <- runif(length(ix)) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(input_test_sol$X) +input_test_sol$X@x <- input_test_sol$X@x / rs[input_test_sol$X@i + 1] + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/control_methods/solution/config.vsh.yaml b/src/match_modality/control_methods/solution/config.vsh.yaml new file mode 100644 index 0000000000..b352fdaedf --- /dev/null +++ b/src/match_modality/control_methods/solution/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: solution + namespace: match_modality/control_methods + description: Returns the ground-truth pairing. 
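+ # This positive control passes the solution matrix through unchanged, so it + # should attain the best possible value for each metric.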
+ info: + type: positive_control + method_name: Solution + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/control_methods/solution/script.R b/src/match_modality/control_methods/solution/script.R new file mode 100644 index 0000000000..88b5b746dd --- /dev/null +++ b/src/match_modality/control_methods/solution/script.R @@ -0,0 +1,18 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +par <- list( + input_test_sol = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad", + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +cat("Reading h5ad files\n") +input_test_sol <- anndata::read_h5ad(par$input_test_sol) + +input_test_sol$uns[["method_id"]] <- meta$functionality_name + +cat("Writing predictions to file\n") +zzz <- input_test_sol$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/mask_dataset/config.vsh.yaml b/src/match_modality/mask_dataset/config.vsh.yaml new file mode 100644 index 0000000000..ac341f3b84 --- /dev/null +++ b/src/match_modality/mask_dataset/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../api/comp_mask_dataset.yaml +functionality: + name: mask_dataset + namespace: match_modality + description: | + Censor an existing dataset: obfuscate gene names, remove cell identities and + shuffle the cells of each modality, for distribution to competitors. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, testthat ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ highmem, midtime, highcpu ] diff --git a/src/match_modality/mask_dataset/script.R b/src/match_modality/mask_dataset/script.R new file mode 100644 index 0000000000..f41de56600 --- /dev/null +++ b/src/match_modality/mask_dataset/script.R @@ -0,0 +1,179 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(assertthat, quietly = TRUE, warn.conflicts = FALSE) +library(Matrix, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +# input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# input_path <- "output/datasets/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting." +# input_path <- "output/datasets/common/openproblems_bmmc_cite_phase1/openproblems_bmmc_cite_phase1.manual_formatting." +input_path <- "resources_test/common/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +output_path <- "output/multiome" +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset." +# output_path <- "output/datasets/match_modality/openproblems_bmmc_multiome_iid/openproblems_bmmc_multiome_iid.censor_dataset." 
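+# Editor's sketch of the masking logic below (toy values for illustration): +# mod2 rows are shuffled with a permutation ix, so mod2 output row k holds the +# cell from mod1 row ix[k]; the solution matrix is then built as +# sparseMatrix(i = seq_along(ix), j = order(ix)), because order(ix)[i] is the +# mod2 row that holds mod1 cell i. E.g. ix <- c(3, 1, 2) gives +# order(ix) == c(2, 3, 1): mod1 cell 1 sits in mod2 row 2.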
+# dir.create(dirname(output_path), recursive = TRUE) + +par <- list( + input_mod1 = paste0(input_path, "output_rna.h5ad"), + input_mod2 = paste0(input_path, "output_mod2.h5ad"), + output_train_mod1 = paste0(output_path, "output_train_mod1.h5ad"), + output_train_mod2 = paste0(output_path, "output_train_mod2.h5ad"), + output_train_sol = paste0(output_path, "output_train_sol.h5ad"), + output_test_mod1 = paste0(output_path, "output_test_mod1.h5ad"), + output_test_mod2 = paste0(output_path, "output_test_mod2.h5ad"), + output_test_sol = paste0(output_path, "output_test_sol.h5ad"), + seed = 1L, + knn = 10L +) +## VIASH END + +set.seed(par$seed) + +subset_mats <- function(li, obs_filt, anonymize = FALSE) { + out <- list() + for (n in names(li)) { + mat <- li[[n]][obs_filt, , drop = FALSE] + if (anonymize) { + rownames(mat) <- paste0("cell_", seq_len(nrow(mat))) + } + out[[n]] <- mat + } + out +} + + +cat("Reading input data\n") +input_mod1 <- anndata::read_h5ad(par$input_mod1) +input_mod2 <- anndata::read_h5ad(par$input_mod2) +ad1_mod <- unique(input_mod1$var[["feature_types"]]) +ad2_mod <- unique(input_mod2$var[["feature_types"]]) +new_dataset_id <- paste0(input_mod1$uns[["dataset_id"]], "_MM_", tolower(ad1_mod), "2", tolower(ad2_mod)) +ad1_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad2_uns <- list(dataset_id = new_dataset_id, organism = "human") +ad1_obsm <- list() +ad2_obsm <- list() + +if (ad1_mod == "ATAC") { + ad1_uns$gene_activity_var_names <- input_mod1$uns$gene_activity_var_names + ad1_obsm$gene_activity <- as(input_mod1$obsm$gene_activity, "CsparseMatrix") +} +if (ad2_mod == "ATAC") { + ad2_uns$gene_activity_var_names <- input_mod2$uns$gene_activity_var_names + ad2_obsm$gene_activity <- as(input_mod2$obsm$gene_activity, "CsparseMatrix") +} + +cat("Shuffle train cells\n") +train_ix <- which(input_mod1$obs$is_train) %>% sort +train_mod2_ix <- sample.int(length(train_ix)) + +cat("Shuffle test cells\n") +test_ix <- which(!input_mod1$obs$is_train) %>% sort +test_mod2_ix <- sample.int(length(test_ix)) + +is_categorical <- function(x) is.character(x) || is.factor(x) +# relevel <- function(x) factor(as.character(x)) +relevel <- function(x) as.character(x) + +cat("Creating train objects\n") +mod1_var <- input_mod1$var %>% select(one_of("gene_ids", "feature_types")) +mod2_var <- input_mod2$var %>% select(one_of("gene_ids", "feature_types")) +train_obs1 <- input_mod1$obs[train_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +train_obs2 <- input_mod2$obs[train_ix[train_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(train_obs2) <- NULL +if (ncol(train_obs2) == 0) train_obs2 <- NULL +assert_that("size_factors" %in% colnames(train_obs1) != "size_factors" %in% colnames(train_obs2)) +assert_that(all(train_obs1$batch == train_obs2$batch[order(train_mod2_ix)])) + +output_train_mod1 <- anndata::AnnData( + X = input_mod1$X[train_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, train_ix), + obsm = subset_mats(ad1_obsm, train_ix), + obs = train_obs1, + var = mod1_var, + uns = ad1_uns +) +output_train_mod2 <- anndata::AnnData( + X = input_mod2$X[train_ix[train_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, train_ix[train_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, train_ix[train_mod2_ix], anonymize = TRUE), + obs = train_obs2, + var = 
mod2_var, + uns = ad2_uns +) +assert_that(all(output_train_mod1$obs$batch == output_train_mod2$obs$batch[order(train_mod2_ix)])) + +cat("Creating test objects\n") +test_obs1 <- input_mod1$obs[test_ix, , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +test_obs2 <- input_mod2$obs[test_ix[test_mod2_ix], , drop = FALSE] %>% + select(one_of("batch", "size_factors")) %>% + mutate_if(is_categorical, relevel) +rownames(test_obs2) <- NULL +if (ncol(test_obs2) == 0) test_obs2 <- NULL +assert_that("size_factors" %in% colnames(test_obs1) != "size_factors" %in% colnames(test_obs2)) +assert_that(all(test_obs1$batch == test_obs2$batch[order(test_mod2_ix)])) + +output_test_mod1 <- anndata::AnnData( + X = input_mod1$X[test_ix, , drop = FALSE], + layers = subset_mats(input_mod1$layers, test_ix), + obsm = subset_mats(ad1_obsm, test_ix), + obs = test_obs1, + var = mod1_var, + uns = ad1_uns +) +output_test_mod2 <- anndata::AnnData( + X = input_mod2$X[test_ix[test_mod2_ix], , drop = FALSE] %>% + magrittr::set_rownames(., paste0("cell_", seq_len(nrow(.)))), + layers = subset_mats(input_mod2$layers, test_ix[test_mod2_ix], anonymize = TRUE), + obsm = subset_mats(ad2_obsm, test_ix[test_mod2_ix], anonymize = TRUE), + obs = test_obs2, + var = mod2_var, + uns = ad2_uns +) +assert_that(all(output_test_mod1$obs$batch == output_test_mod2$obs$batch[order(test_mod2_ix)])) + +cat("Creating solution objects\n") + +train_sol_mat <- Matrix::sparseMatrix( + i = seq_along(train_mod2_ix), + j = order(train_mod2_ix), + x = rep(1, length(train_mod2_ix)) +) +output_train_sol <- anndata::AnnData( + X = train_sol_mat, + obs = input_mod1$obs[train_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = train_mod2_ix - 1) +) + +test_sol_mat <- Matrix::sparseMatrix( + i = seq_along(test_mod2_ix), + j = order(test_mod2_ix), + x = rep(1, length(test_mod2_ix)) +) +output_test_sol <- anndata::AnnData( + X = test_sol_mat, + obs = input_mod1$obs[test_ix, , drop = FALSE] %>% select(one_of(c("batch"))) %>% mutate_if(is_categorical, relevel), + uns = list(dataset_id = new_dataset_id, pairing_ix = test_mod2_ix - 1) +) + +# checks +# mean(rowSums(train_solknn > 0)) +# mean(rowSums(test_solknn > 0)) +# sum(train_solknn * train_sol_mat) == nrow(train_sol_mat) +# sum(test_solknn * test_sol_mat) == nrow(test_sol_mat) + +cat("Saving output files as h5ad\n") +zzz <- output_train_mod1$write_h5ad(par$output_train_mod1, compression = "gzip") +zzz <- output_train_mod2$write_h5ad(par$output_train_mod2, compression = "gzip") +zzz <- output_train_sol$write_h5ad(par$output_train_sol, compression = "gzip") +zzz <- output_test_mod1$write_h5ad(par$output_test_mod1, compression = "gzip") +zzz <- output_test_mod2$write_h5ad(par$output_test_mod2, compression = "gzip") +zzz <- output_test_sol$write_h5ad(par$output_test_sol, compression = "gzip") diff --git a/src/match_modality/methods/babel_knn/config.vsh.yaml b/src/match_modality/methods/babel_knn/config.vsh.yaml new file mode 100644 index 0000000000..4a7cdc2e31 --- /dev/null +++ b/src/match_modality/methods/babel_knn/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + status: disabled + name: babel_knn + namespace: match_modality/methods + description: Predict test expression with BABEL and match cells with KNN. 
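+ # BABEL (see paper_doi below) learns to translate between GEX and ATAC + # profiles; script.R uses its ATAC predictions for the test GEX cells and + # matches them to the observed ATAC profiles with a KNN in a shared LMDS + # embedding.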
+ info: + type: method + method_name: Babel+KNN + paper_doi: "10.1073/pnas.2023070118" + arguments: + - name: "--n_dims" + type: "integer" + default: 10 + description: Number of dimensions to use for dimensionality reduction. + - name: "--n_neigh" + type: "integer" + default: 10 + description: Number of neighbors for KNN. Match probability will be 1/n_neigh. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, lmds, FNN ] + bioc: [ DropletUtils ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3, git] + - type: python + pip: [ anndata>=0.8 ] + - type: docker + run: + - wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/conda && /opt/conda/bin/conda -V + - cd / && git clone --depth 1 https://github.com/rcannood/babel.git + - cd /babel && /opt/conda/bin/conda env create -f environment.yml + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/babel_knn/script.R b/src/match_modality/methods/babel_knn/script.R new file mode 100644 index 0000000000..42702b6150 --- /dev/null +++ b/src/match_modality/methods/babel_knn/script.R @@ -0,0 +1,195 @@ +cat(">> Loading dependencies\n") + +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +library(testthat, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("DropletUtils", quietly = TRUE) + +options(tidyverse.quiet = TRUE) +library(tidyverse) + +babel_location <- "/babel/bin/" +conda_bin <- "/opt/conda/bin/conda" + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_dims = 10, + n_neigh = 10 +) +conda_bin <- "conda" +babel_location <- "../babel/bin/" +## VIASH END + +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2, backed = TRUE) +if (input_train_mod2$var$feature_types[[1]] != "ATAC") { + cat("Error: babel only runs on GEX to ATAC datasets\n") + quit(save = "no", status = 42) +} + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat(">> Filling in missing gene ids\n") +if (is.null(input_train_mod1$var$gene_ids)) input_train_mod1$var$gene_ids <- colnames(input_train_mod1) +if (is.null(input_train_mod2$var$gene_ids)) input_train_mod2$var$gene_ids <- colnames(input_train_mod2) +if (is.null(input_test_mod1$var$gene_ids)) input_test_mod1$var$gene_ids <- colnames(input_test_mod1) +if (is.null(input_test_mod2$var$gene_ids)) input_test_mod2$var$gene_ids <- colnames(input_test_mod2) + +mod1 <- as.character(unique(input_train_mod1$var$feature_types)) +mod2 <- as.character(unique(input_train_mod2$var$feature_types)) + +# multiome_matrix for export to Babel's input format +multiome_matrix <- cbind(input_train_mod1$X, input_train_mod2$X) + 
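+# BABEL expects a single joint multiome matrix (cells x [GEX genes + ATAC +# peaks]) in 10x CellRanger H5 format, so the two training matrices are +# column-bound here and written out with DropletUtils::write10xCounts() below; +# feature_type_map translates the task's feature_types into the labels +# CellRanger uses ("Gene Expression" / "Peaks"). +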
+# generate multiome anndata object for training +ad_babel <- anndata::AnnData( + X = multiome_matrix, + var = bind_rows(input_train_mod1$var, input_train_mod2$var) +) + +# setting up babel dirs +tmpdir <- tempfile(pattern = "babel_temp", fileext = "/") +cat(">> Setting up directories for babel at ", tmpdir, "\n", sep = "") +dir.create(tmpdir) +on.exit(unlink(tmpdir, recursive = TRUE)) + +dir_data <- paste0(tmpdir, "data/") # location of input files +dir.create(dir_data) +dir_model <- paste0(tmpdir, "model/") # location of babel model +dir_pred <- paste0(tmpdir, "pred/") # location of predictions + +feature_type_map <- c( + "GEX" = "Gene Expression", + "ADT" = "Peaks", # try to make it run on ADT data as well + "ATAC" = "Peaks" +) + +cat(">> Writing train dataset as 10x-CellRanger H5 format\n") +DropletUtils::write10xCounts( + paste0(dir_data, "train_input.h5"), + t(ad_babel$X), + gene.id = ad_babel$var$gene_ids, + gene.symbol = colnames(ad_babel), + barcodes = rownames(ad_babel), + type = "HDF5", + version = "3", + genome = "GRCh38", + gene.type = unname(feature_type_map[ad_babel$var$feature_types]), + overwrite = TRUE +) + +cat(">> Writing test dataset as 10x-CellRanger H5 format\n") +DropletUtils::write10xCounts( + paste0(dir_data, "test_input.h5"), + t(input_test_mod1$X), + gene.id = input_test_mod1$var$gene_ids, + gene.symbol = colnames(input_test_mod1), + barcodes = rownames(input_test_mod1), + type = "HDF5", + version = "3", + genome = "GRCh38", + gene.type = unname(feature_type_map[input_test_mod1$var$feature_types]), + overwrite = TRUE +) + + +cat(">> Babel: train model\n") +babel_train_cmd <- paste0( + conda_bin, " run -n babel ", + "python ", babel_location, "train_model.py ", + "--data ", dir_data, "train_input.h5 ", + "--outdir ", dir_model, " ", + "--nofilter" +) +# stringent filtering causes babel to sometimes fail +# reason: https://github.com/wukevin/babel/blob/main/babel/sc_data_loaders.py#L168-L190 + +out1 <- system(babel_train_cmd) + +# check whether training succeeded +expect_equal(out1, 0, info = paste0("Model training failed with exit code ", out1)) + +cat(">> Babel: predict from model\n") +babel_pred_cmd <- paste0( + conda_bin, " run -n babel ", + "python ", babel_location, "predict_model.py ", + "--checkpoint ", dir_model, " ", + "--data ", dir_data, "test_input.h5 ", + "--outdir ", dir_pred, " ", + "--nofilter" +) +out2 <- system(babel_pred_cmd) + +# check whether prediction succeeded +expect_equal(out2, 0, info = paste0("Prediction failed with exit code ", out2)) + +cat(">> Read predictions\n") +pred <- anndata::read_h5ad(paste0(dir_pred, "/rna_atac_adata.h5ad")) + +####################################### +##### KNN + +# Only some features are present in Babel's output +input_test_mod2_filter <- input_test_mod2[, row.names(input_test_mod2$var) %in% row.names(pred$var)] + +# Dimensional reduction of both predicted and test profiles +pred_profiles <- pred[, row.names(input_test_mod2_filter$var)]$X + +cat("Performing DR on test values\n") +dr <- lmds::lmds( + rbind(pred_profiles, input_test_mod2_filter$X), + ndim = par$n_dims, + distance_method = "pearson" # par does not define distance_method; use the same distance as the other DR-based methods +) + +train_ix <- seq_len(nrow(pred_profiles)) +dr_preds <- dr[train_ix, , drop = FALSE] +dr_test_mod2 <- dr[-train_ix, , drop = FALSE] + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + dr_preds, + dr_test_mod2, + k = min(1000, nrow(dr_preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = 
as.vector(knn_out$nn.index), + x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(input_test_mod1), nrow(input_test_mod2)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_mod1$uns[["dataset_id"]], + method_id = "baseline_babel_knn" + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..37321fcad1 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_cbf + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with a consensus best fit algorithm. + info: + type: method + method_name: LMDS+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 100 + description: Population size. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/match_modality/methods/dr_knnr_cbf/script.R b/src/match_modality/methods/dr_knnr_cbf/script.R new file mode 100644 index 0000000000..021e2d6483 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_cbf/script.R @@ -0,0 +1,144 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +path <- "output/datasets/match_modality/openproblems_bmmc_cite_phase1_rna/openproblems_bmmc_cite_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on mod1 input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) +rm(input_train_mod1, input_test_mod1) +gc() + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) +gc() + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- 
dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod2 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on mod2 input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) +gc() + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) +gc() + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} + +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..fd9387d6d4 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/config.vsh.yaml @@ -0,0 +1,24 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: dr_knnr_knn + namespace: match_modality/methods + description: Perform LMDS+Pearson DR, predict embedding with KNN regression, and match cells with KNN. 
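+ # Same LMDS + KNN-regression front-end as dr_knnr_cbf, but the final matching + # step is a single KNN lookup instead of the consensus best fit search.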
+ info: + type: method + method_name: LMDS+KNNR+KNN + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/dr_knnr_knn/script.R b/src/match_modality/methods/dr_knnr_knn/script.R new file mode 100644 index 0000000000..fa32cbdf29 --- /dev/null +++ b/src/match_modality/methods/dr_knnr_knn/script.R @@ -0,0 +1,104 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1_rna/openproblems_bmmc_multiome_phase1_rna.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad" +) +meta <- list(functionality_name = "foo") +## VIASH END + +method_id <- meta$functionality_name + +cat("Read train sol\n") +input_train_sol <- anndata::read_h5ad(par$input_train_sol) + +cat("Reading mod1 h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +cat("Running LMDS on mod1 input data\n") +# merge input matrices +mod1_X <- rbind(input_train_mod1$X, input_test_mod1$X) +rm(input_train_mod1, input_test_mod1) + +# perform DR +dr_x1 <- lmds::lmds(mod1_X, ndim = 10, distance_method = "pearson") +rm(mod1_X) + +# split input matrices +dr_x1_train <- dr_x1[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x1_test <- dr_x1[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Reading mod2 h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +cat("Running LMDS on mod2 input data\n") +# merge input matrices +match_train <- input_train_sol$uns$pairing_ix + 1 +mod2_X <- rbind(input_train_mod2$X[order(match_train), , drop = FALSE], input_test_mod2$X) +rm(input_train_mod2, input_test_mod2) + +# perform DR +dr_x2 <- lmds::lmds(mod2_X, ndim = 3, distance_method = "pearson") +rm(mod2_X) + +# split input matrices +dr_x2_train <- dr_x2[seq_len(nrow(input_train_sol)), , drop = FALSE] +dr_x2_test <- dr_x2[-seq_len(nrow(input_train_sol)), , drop = FALSE] + +cat("Predicting for each column in modality 2\n") +preds <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod2 DR and predicted test mod2\n") +knn_out <- FNN::get.knnx( + preds, + dr_x2_test, + k = min(1000, nrow(dr_x1_test)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + 
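+# Editor's note on the idiom above: for a dgCMatrix, @x holds the stored +# non-zero values and @i their 0-based row indices, so dividing @x by +# rs[@i + 1] row-normalises without densifying the matrix. A tiny check with +# toy values: +# m <- Matrix::sparseMatrix(i = c(1, 1, 2), j = c(1, 2, 2), x = c(1, 3, 2)) +# m@x <- m@x / Matrix::rowSums(m)[m@i + 1] +# Matrix::rowSums(m) # both rows now sum to 1 +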
+cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = input_train_sol$uns[["dataset_id"]], + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/linear_knn/config.vsh.yaml b/src/match_modality/methods/linear_knn/config.vsh.yaml new file mode 100644 index 0000000000..20fa30783b --- /dev/null +++ b/src/match_modality/methods/linear_knn/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: linear_knn + namespace: match_modality/methods + description: Perform DR with Linear Regression, and match cells with kNN + info: + type: method + method_name: Linear Regression and KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 10 + description: Number of SVD components to compress to + - name: "--n_neighbors" + type: "integer" + default: 10 + description: Number of neighbors for matching modalities + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata, scipy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/linear_knn/script.py b/src/match_modality/methods/linear_knn/script.py new file mode 100644 index 0000000000..c776337a4a --- /dev/null +++ b/src/match_modality/methods/linear_knn/script.py @@ -0,0 +1,105 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import normalize + +## VIASH START +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
+par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, + "n_neighbors": 10, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then runs linear regression to predict mod2 +# from mod1 and finally performs kNN to match modalities + +# unscramble training cells +mod2_order = np.argsort(input_train_sol.uns['pairing_ix']) +input_train_mod2 = input_train_mod2[mod2_order, :] + +# concatenate train and test data +mod1 = ad.concat( + { + "train": input_train_mod1, + "test": input_test_mod1 + }, + index_unique="-", + label="group" +) +mod2 = ad.concat( + { + "train": input_train_mod2, + "test": input_test_mod2 + }, + index_unique="-", + label="group" +) + +# Create helper views to access the test data later +mod1tr = mod1[mod1.obs["group"] == "train", :] +mod2tr = mod2[mod2.obs["group"] == "train", :] + +mod1te = mod1[mod1.obs["group"] == "test", :] +mod2te = mod2[mod2.obs["group"] == "test", :] + +logging.info("Running PCA") +n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars) + +# Use TruncatedSVD for fast decomposition of the data +mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X) +mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X) + +reg = LinearRegression() + +reg.fit(mod1tr.obsm["X_pca"], mod2tr.obsm["X_pca"]) +mod2te_pred = reg.predict(mod1te.obsm["X_pca"]) + +neighbors = NearestNeighbors(n_neighbors=np.min((mod2te.shape[0], par["n_neighbors"])), n_jobs=-1) +neighbors = neighbors.fit(mod2te_pred) + +distances, indices = neighbors.kneighbors(mod2te.obsm["X_pca"]) + +prediction = np.zeros((mod2te.shape[0], mod2te.shape[0])) +for i, neighbor_ix in enumerate(indices): + prediction[i, neighbor_ix] = 1 / distances[i] + +prediction = normalize(prediction, norm="l1") + +prediction = scipy.sparse.csr_matrix(prediction) + +logging.info("Write prediction output") +prediction = ad.AnnData( + X=prediction, + uns={ + "dataset_id": input_train_mod1.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) +prediction.write_h5ad(par["output"]) diff --git a/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml new file mode 100644 index 0000000000..6fcc058b2a --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_method.yaml 
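+# NewWave fits a batch-aware negative binomial latent factor model to the raw +# counts; script.R uses its K = 10 latent dimensions as the DR embedding.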
+functionality: + name: newwave_knnr_cbf + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with a consensus best fit algorithm. + info: + type: method + method_name: NewWave+KNNR+CBF + arguments: + - name: "--n_pop" + type: "integer" + default: 300 + description: Population size. + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN, pbapply, proxyC ] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8 ] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_cbf/script.R b/src/match_modality/methods/newwave_knnr_cbf/script.R new file mode 100644 index 0000000000..ddc291a517 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_cbf/script.R @@ -0,0 +1,188 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pbapply", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +path <- "output/datasets/match_modality/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + n_pop = 300L, + newwave_maxiter = 10, + newwave_ngene = 200L, + newwave_ncell = 200L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- 
data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, , drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +pred_mod1 <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Predicting mod2 DR of test cells\n") +pred_mod2 <- apply(dr_x2_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x1_train, + test = dr_x1_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + +cat("Minimising distances between mod1 and mod2 pairs\n") +gen_vec <- function(z) { + int <- seq_len(nrow(pred_mod1)) + + i <- j <- c() + resti <- int + restj <- int + + while (length(resti) > 0) { + ixi <- sample.int(length(resti), 1) + newi <- resti[[ixi]] + d1 <- proxy::dist(pred_mod1[restj, , drop = FALSE], dr_x1_test[newi, , drop = FALSE], method = "euclidean") + d2 <- proxy::dist(pred_mod2[restj, , drop = FALSE], dr_x2_test[newi, , drop = FALSE], method = "euclidean") + d12 <- d1 + d2 + ixj <- which.min(d12[, 1]) + newj <- restj[[ixj]] + resti <- resti[-ixi] + restj <- restj[-ixj] + i <- c(i, newi) + j <- c(j, newj) + + # tibble(i, j); tibble(resti, restj) + } + + tibble::tibble(i, j) +} +outs <- pbapply::pblapply(seq_len(par$n_pop), cl = n_cores, gen_vec) + +# outs <- lapply(seq_len(par$n_pop), gen_vec) +df <- bind_rows(outs) %>% + group_by(i, j) %>% + summarise(n = n(), .groups = "drop") %>% + arrange(desc(n)) %>% + mutate(gold = i == j) + +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$n, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- 
Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml new file mode 100644 index 0000000000..4293005a23 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave_knnr_knn + namespace: match_modality/methods + description: Perform DR with NewWave, predict embedding with KNN regression, and match cells with KNN. + info: + type: method + method_name: NewWave+KNNR+KNN + arguments: + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: eddelbuettel/r2u:22.04 + setup: + - type: r + cran: [ anndata, tidyverse, lmds, dynutils, proxy, FNN] + bioc: [ SingleCellExperiment, NewWave ] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3] + - type: python + pip: [ anndata>=0.8, tensorflow ] + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/match_modality/methods/newwave_knnr_knn/script.R b/src/match_modality/methods/newwave_knnr_knn/script.R new file mode 100644 index 0000000000..2810af1ab2 --- /dev/null +++ b/src/match_modality/methods/newwave_knnr_knn/script.R @@ -0,0 +1,162 @@ +cat("Loading dependencies\n") +options(tidyverse.quiet = TRUE) +library(tidyverse) +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +# path <- "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter." +# path <- "output/public_datasets/match_modality/dyngen_citeseq_1/dyngen_citeseq_1.censor_dataset.output_" +path <- "output/public_datasets/match_modality/dyngen_atac_1/dyngen_atac_1.censor_dataset.output_" +# path <- "debug/debug." 
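+# Note on the design below: after censoring, the batch labels of the shuffled +# test mod2 cells are unknown, so they are pooled into a single "unknownbatch" +# level when building the NewWave batch covariate.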
+par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + input_train_sol = paste0(path, "train_sol.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_test_mod2 = paste0(path, "test_mod2.h5ad"), + output = "output.h5ad", + newwave_maxiter = 10L, + newwave_ngene = 200L, + newwave_ncell = 200L +) +meta <- list(functionality_name = "foo") + +# # read in solution data to check whether method is working +# input_test_sol <- anndata::read_h5ad(paste0(path, "test_sol.h5ad")) +# match_test <- input_test_sol$uns$pairing_ix + 1 +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +input_train_sol <- anndata::read_h5ad(par$input_train_sol) +match_train <- input_train_sol$uns$pairing_ix + 1 + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch a few variables +train_ix <- seq_len(nrow(input_train_mod1)) +did <- input_train_mod1$uns[["dataset_id"]] +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) + +cat("Running NewWave\n") +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod1$layers[["counts"]]), t(input_test_mod1$layers[["counts"]]))), + colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +# option 1: filter by HVG +# data1 <- data1[order(proxyC::rowSds(SummarizedExperiment::assay(data1)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod1, input_test_mod1) +gc() + +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +rm(data1) +dr_x1 <- SingleCellExperiment::reducedDim(res1) + +cat("Reading h5ad files\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) +input_test_mod2 <- anndata::read_h5ad(par$input_test_mod2) + +# don't know batch ordering in input_test_mod2 +batch2 <- c(as.character(input_train_sol$obs$batch), rep("unknownbatch", nrow(input_test_mod2))) + +data2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = cbind(t(input_train_mod2$layers[["counts"]][order(match_train), , drop = FALSE]), t(input_test_mod2$layers[["counts"]]))), + colData = data.frame(batch = factor(batch2)) +) +data2 <- data2[Matrix::rowSums(SummarizedExperiment::assay(data2)) > 0, ] +# data2 <- data2[order(proxyC::rowSds(SummarizedExperiment::assay(data2)), decreasing = TRUE)[1:100], ] + +# remove large unneeded dataset objects +rm(input_train_mod2, input_test_mod2) +gc() + +cat("Running NewWave\n") +res2 <- NewWave::newWave( + data2, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data2)), + n_cell_par = min(par$newwave_ncell, ncol(data2)), + commondispersion = FALSE +) +dr_x2 <- SingleCellExperiment::reducedDim(res2) + +colnames(dr_x1) <- paste0("comp_", seq_len(ncol(dr_x1))) +colnames(dr_x2) <- paste0("comp_", seq_len(ncol(dr_x2))) + +# split up DR matrices +dr_x1_train <- dr_x1[train_ix, , drop = FALSE] +dr_x1_test <- dr_x1[-train_ix, , drop = FALSE] +dr_x2_train <- dr_x2[train_ix, , drop = FALSE] +dr_x2_test <- dr_x2[-train_ix, 
, drop = FALSE] + +cat("Predicting mod1 DR of test cells\n") +preds <- apply(dr_x1_train, 2, function(yi) { + FNN::knn.reg( + train = dr_x2_train, + test = dr_x2_test, + y = yi, + k = min(15, nrow(dr_x1_test)) + )$pred +}) + + +cat("Performing KNN between test mod1 DR and predicted test mod1\n") +# match each mod1 test cell against the predicted mod1 coordinates of the +# mod2 test cells (both live in the mod1 DR space) +knn_out <- FNN::get.knnx( + preds, + dr_x1_test, + k = min(1000, nrow(preds)) +) + +cat("Creating output data structures\n") +df <- tibble( + i = as.vector(row(knn_out$nn.index)), + j = as.vector(knn_out$nn.index), + x = 1000 - as.vector(col(knn_out$nn.index)) + 1 + # x = max(knn_out$nn.dist) * 2 - as.vector(knn_out$nn.dist) +) +knn_mat <- Matrix::sparseMatrix( + i = df$i, + j = df$j, + x = df$x, + dims = list(nrow(dr_x1_test), nrow(dr_x2_test)) +) + +# normalise to make rows sum to 1 +rs <- Matrix::rowSums(knn_mat) +knn_mat@x <- knn_mat@x / rs[knn_mat@i + 1] + +cat("Creating output anndata\n") +out <- anndata::AnnData( + X = as(knn_mat, "CsparseMatrix"), + uns = list( + dataset_id = did, + method_id = method_id + ) +) + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/match_modality/methods/procrustes_knn/config.vsh.yaml b/src/match_modality/methods/procrustes_knn/config.vsh.yaml new file mode 100644 index 0000000000..65d0327291 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: procrustes_knn + namespace: match_modality/methods + description: Perform DR with truncated SVD, align the two embeddings with Procrustes, and match cells with KNN. + info: + type: method + method_name: Procrustes+KNN + arguments: + - name: "--n_svd" + type: "integer" + default: 100 + description: Number of SVD components to compress to + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: "python:3.10" + setup: + - type: python + pip: [ anndata>=0.8, scipy, scikit-learn ] + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/match_modality/methods/procrustes_knn/script.py b/src/match_modality/methods/procrustes_knn/script.py new file mode 100644 index 0000000000..3e16b5fbb7 --- /dev/null +++ b/src/match_modality/methods/procrustes_knn/script.py @@ -0,0 +1,112 @@ +import logging +import anndata as ad +import scipy.spatial +import scipy.sparse +import numpy as np + +from sklearn.preprocessing import normalize +from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors + +## VIASH START + +# Anything within this block will be removed by `viash` and will be +# replaced with the parameters as specified in your config.vsh.yaml. 
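+# scipy.spatial.procrustes(A, B) standardises both point sets and returns the +# two transformed copies plus a disparity value (the sum of squared pointwise +# distances); the smaller the disparity after alignment, the better the two +# SVD embeddings overlap and the more reliable the KNN matching below. +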
+ +par = { + "input_train_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad", + "input_train_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad", + "input_train_sol": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_sol.h5ad", + "input_test_mod1": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad", + "input_test_mod2": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad", + "output": "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad", + "n_svd": 100, +} + +meta = { + "functionality_name": "foo" +} +## VIASH END + +logging.basicConfig(level=logging.INFO) + +logging.info("Load datasets") +input_train_mod1 = ad.read_h5ad(par["input_train_mod1"]) +input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) +# input_train_sol = ad.read_h5ad(par["input_train_sol"]) +input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) +input_test_mod2 = ad.read_h5ad(par["input_test_mod2"]) + +# This method runs PCA on each modality individually, then uses the Procrustes method to identify +# a linear transform that best superimposes the points from modality 1 onto modality 2. + +# concatenate train and test data +mod1 = ad.concat( + { + "train": input_train_mod1, + "test": input_test_mod1 + }, + index_unique="-", + label="group" +) +mod2 = ad.concat( + { + "train": input_train_mod2, + "test": input_test_mod2 + }, + index_unique="-", + label="group" +) + +# Create helper views to access the test data later +mod1te = mod1[mod1.obs["group"] == "test", :] +mod2te = mod2[mod2.obs["group"] == "test", :] + +logging.info("Running PCA") +n_svd = min(par["n_svd"], mod1.n_obs, mod2.n_obs, mod1.n_vars, mod2.n_vars) + +# Use TruncatedSVD for fast decomposition of the data +mod1.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod1.X) +mod2.obsm["X_pca"] = TruncatedSVD(n_svd).fit_transform(mod2.X) + +logging.info("Running Procrustes Alignment") +# This function takes in two matrices of points A and B, standardizes both, and applies a linear +# transformation to matrix B to minimize the disparity measured as the sum of the squares of the +# pointwise distances between the two input datasets +mod1.obsm["X_pro"], mod2.obsm["X_pro"], disparity = scipy.spatial.procrustes( + mod1.obsm["X_pca"], + mod2.obsm["X_pca"], +) +logging.info("> Disparity value is: %0.3f" % disparity) + +logging.info("Perform nearest neighbors") +# To get the matching matrix, for each point in mod1_test, we take the 1000 nearest neighbors of that +# point in the transformed mod2_test dataset +n_neighbors = min(1000, mod1te.n_obs, mod1te.n_vars, mod2te.n_obs, mod2te.n_vars) +# fit on the aligned mod2 points and query with mod1, so that row i of `indices` lists the +# candidate mod2 matches for mod1 cell i (matching ind_i below) +nn = NearestNeighbors(n_neighbors=n_neighbors).fit(mod2te.obsm["X_pro"]) +distances, indices = nn.kneighbors(X=mod1te.obsm["X_pro"]) + +logging.info("Create pairing matrix") +# Translate the neighborhood assignments to a pairing matrix that is (n_obs, n_obs) +# NOTE: `pairing_matrix` must have NO MORE than 1000*n_obs non-zero entries for fast metric computation +ind_i = np.tile(np.arange(mod1te.n_obs), (n_neighbors, 1)).T.flatten() +ind_j = indices.flatten() +ind_dist = distances.flatten() +ind_x = 2 * max(ind_dist) - ind_dist +pairing_matrix = scipy.sparse.csr_matrix( + (ind_x, (ind_i, ind_j)), + 
+    shape=(input_test_mod1.n_obs, input_test_mod2.n_obs)
+)
+
+# row normalise
+prob_matrix = normalize(pairing_matrix, norm="l1")
+
+logging.info("Write prediction output")
+prediction = ad.AnnData(
+    X=prob_matrix,
+    uns={
+        "dataset_id": input_train_mod1.uns["dataset_id"],
+        "method_id": meta["functionality_name"]
+    }
+)
+prediction.write_h5ad(par["output"])
diff --git a/src/match_modality/metrics/aupr/config.vsh.yaml b/src/match_modality/metrics/aupr/config.vsh.yaml
new file mode 100644
index 0000000000..205efb353e
--- /dev/null
+++ b/src/match_modality/metrics/aupr/config.vsh.yaml
@@ -0,0 +1,40 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: aupr
+  namespace: match_modality/metrics
+  description: Compute area-under-the-curve pairing metrics (AUROC and AUPR) for the match modality task.
+  info:
+    metrics:
+      - metric_id: pairing_auroc
+        metric_name: Pairing AUROC
+        metric_description: Area under the ROC curve of the pairing predictions
+        maximize: true
+        min: 0
+        max: 1
+      - metric_id: pairing_aupr
+        metric_name: Pairing AUPR
+        metric_description: Area under the precision-recall curve of the pairing predictions
+        maximize: true
+        min: 0
+        max: 1
+  resources:
+    - type: r_script
+      path: script.R
+  test_resources:
+    - type: r_script
+      path: test_custom.R
+    - path: ../../../../resources_test
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, pracma, tidyverse, testthat ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
+
diff --git a/src/match_modality/metrics/aupr/script.R b/src/match_modality/metrics/aupr/script.R
new file mode 100644
index 0000000000..1609d6f673
--- /dev/null
+++ b/src/match_modality/metrics/aupr/script.R
@@ -0,0 +1,136 @@
+cat("Load dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+requireNamespace("pracma", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  input_solution = "output/output_test_sol.h5ad",
+  input_prediction = "output/dr_knnr_cbf_prediction.h5ad",
+  output = "output/aupr_score.h5ad"
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+cat("Read prediction h5ad\n")
+expect_true(
+  grepl("\\.h5ad$", par$input_prediction),
+  info = "Prediction file should be an h5ad file"
+)
+ad_pred <-
+  tryCatch({
+    anndata::read_h5ad(par$input_prediction)
+  }, error = function(e) {
+    stop(paste0("Can't open prediction h5ad file. 
Detailed error message:\n", e$message)) + }) +expect_true( + ad_sol$uns$dataset_id == ad_pred$uns$dataset_id +) + +X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(ad_sol$uns$pairing_ix)] +dimnames(X_pred) <- list(NULL, NULL) + +cat("Data wrangling\n") +pred_summ <- summary(X_pred) %>% + as_tibble() %>% + mutate(gold = i == j) %>% + arrange(desc(x)) + +# helper function +calculate_au <- function(values, are_true, num_positive_interactions, num_possible_interactions, extend_by = 10000) { + ord <- order(rank(values, ties.method = "random"), decreasing = TRUE) + values <- values[ord] + are_true <- are_true[ord] + + # calculate base statistics + num_selected <- seq_along(are_true) + tp <- cumsum(are_true) + fp <- num_selected - tp + length_ranking <- length(tp) + num_negative_interactions <- num_possible_interactions - num_positive_interactions + + # extend base statistics, if necessary + if (extend_by > 0 && length_ranking != num_possible_interactions) { + diff.predictions <- num_possible_interactions - length_ranking + diff.trues <- num_positive_interactions - tail(tp, 1) + diff.negs <- num_negative_interactions - tail(fp, 1) + + multiplier <- seq_len(extend_by) / extend_by + + extra_num_selected <- multiplier * diff.predictions + tail(num_selected, 1) + extra_tp <- multiplier * diff.trues + tail(tp, 1) + extra_fp <- multiplier * diff.negs + tail(fp, 1) + + num_selected <- c(num_selected, extra_num_selected) + are_true <- c(are_true, rep(NA, extend_by)) + values <- c(values, rep(NA, extend_by)) + tp <- c(tp, extra_tp) + fp <- c(fp, extra_fp) + } + + # calculate extended statistics + metrics <- tibble( + num_selected = c(0, num_selected), + value = c(NA, values), + are_true = c(NA, are_true), + tp = c(0, tp), + fp = c(0, fp), + fn = num_positive_interactions - tp, + tn = num_negative_interactions - fp, + acc = (tp + tn) / (num_positive_interactions + num_negative_interactions), + tpr = tp / num_positive_interactions, + spec = tn / num_negative_interactions, + prec = ifelse(num_selected == 0, 1, tp / (tp + fp)), + npv = tn / (tn + fn), + f1 = 2 * tp / (2 * tp + fp + fn), + mcc = ifelse(num_selected == 0, 0, (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))), + informedness = tpr + spec - 1, + markedness = prec + npv - 1 + ) + + # calculate area under the curves + area_under <- tibble( + auroc = pracma::trapz(1 - metrics$spec, metrics$tpr), + aupr = abs(pracma::trapz(metrics$tpr, metrics$prec)) + ) + + list(metrics = metrics, area_under = area_under) +} + + +cat("Calculate area under the curve\n") +au_out <- calculate_au( + values = pred_summ$x, + are_true = pred_summ$gold, + num_positive_interactions = nrow(X_pred), + num_possible_interactions = (nrow(X_pred) * 1.0) * nrow(X_pred) +) + +# GENIE3bis::plot_curves(au_out) + +colnames(au_out$area_under) <- paste0("pairing_", colnames(au_out$area_under)) + +cat("Create output object\n") +out_values <- c( + as.list(au_out$area_under) +) + +out <- anndata::AnnData( + X = NULL, + shape = dim(ad_sol), + uns = list( + dataset_id = ad_pred$uns$dataset_id, + method_id = ad_pred$uns$method_id, + metric_ids = names(out_values), + metric_values = as.numeric(out_values), + genie3 = au_out + ) +) + +cat("Write output to h5ad file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") \ No newline at end of file diff --git a/src/match_modality/metrics/aupr/test_custom.R b/src/match_modality/metrics/aupr/test_custom.R new file mode 100644 index 0000000000..61e120f8d4 --- /dev/null +++ b/src/match_modality/metrics/aupr/test_custom.R 
@@ -0,0 +1,100 @@ +library(assertthat, quietly = TRUE) +library(Matrix, quietly = TRUE) +requireNamespace("anndata", quietly = TRUE) + +## VIASH START +# This code block will be replaced by viash at runtime. +meta <- list(functionality_name = "foo") +## VIASH END + +# determine filenames and arguments +testpar <- list( + "input_solution" = "temp_sol.h5ad", + "input_prediction" = "temp_pred.h5ad", + "output" = "temp_out.h5ad" +) +command <- paste0("./", meta[["functionality_name"]]) +args <- unlist(rbind(paste0("--", names(testpar)), unname(testpar))) + +# uncomment this for manual testing +# command <- "viash" +# args <- c("run", "src/match_modality/metrics/aupr/config.vsh.yaml", "--", args) + +cat("Creating test files\n") +ad_sol <- anndata::AnnData( + X = as(Matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix") + ), + uns = list(dataset_id = "simple", pairing_ix = c(0, 1, 2)), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix"), + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores1 <- ad_out$uns$metric_values +names(scores1) <- ad_out$uns$metric_ids +# assert_that( +# scores1[["pairing_aupr"]] >= scores1[["neighbor_aupr"]], +# scores1[["neighbor_aupr"]] >= scores1[["celltype_aupr"]], +# scores1[["pairing_auroc"]] >= scores1[["neighbor_auroc"]], +# scores1[["pairing_auroc"]] >= scores1[["celltype_auroc"]] +# ) + + +cat("Creating test files\n") +# pairing_ix <- c(2, 1, 3) +pairing_ix <- c(3, 1, 2) +ad_sol <- anndata::AnnData( + X = as(diag(length(pairing_ix)), "CsparseMatrix")[,pairing_ix], + layers = list( + neighbors = as(Matrix(c(1, 0.5, 0, 0.5, 1, 0.25, 0, 0.25, 1), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix] + ), + uns = list(dataset_id = "simple", pairing_ix = pairing_ix-1), + obs = data.frame( + cell_type = c("a", "a", "b") + ) +) +ad_pred <- anndata::AnnData( + X = as(Matrix(c(1, .1, .2, .3, .9, .4, .5, .6, .8), nrow = 3, byrow = TRUE, sparse = TRUE), "CsparseMatrix")[,pairing_ix], + uns = list(dataset_id = "simple", method_id = "simple") +) + +ad_sol$write_h5ad(testpar$input_solution, compression = "gzip") +ad_pred$write_h5ad(testpar$input_prediction, compression = "gzip") + +cat("> Running metric\n") +out <- processx::run( + command = command, + args = args, + stderr_to_stdout = TRUE +) + +cat("> Reading metric scores\n") +assert_that(file.exists(testpar$output)) +ad_out <- anndata::read_h5ad(testpar$output) + +scores2 <- ad_out$uns$metric_values +names(scores2) <- ad_out$uns$metric_ids + +assert_that(all(scores1 == scores2)) + +cat("> Test succeeded!\n") diff --git a/src/match_modality/metrics/check_format/config.vsh.yaml b/src/match_modality/metrics/check_format/config.vsh.yaml new file mode 100644 index 0000000000..94e833b471 --- /dev/null +++ b/src/match_modality/metrics/check_format/config.vsh.yaml @@ -0,0 +1,35 @@ +__merge__: 
../../api/comp_metric.yaml
+functionality:
+  name: check_format
+  namespace: match_modality/metrics
+  description: Checking whether the prediction of a method has the right format.
+  info:
+    metrics:
+      - metric_id: finished
+        metric_name: Finished
+        metric_description: Check if the method finished successfully
+        maximize: true
+        min: 0
+        max: 1
+      - metric_id: correct_format
+        metric_name: Correct Format
+        metric_description: Check if the prediction has the right format
+        maximize: true
+        min: 0
+        max: 1
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, bit64 ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
\ No newline at end of file
diff --git a/src/match_modality/metrics/check_format/script.R b/src/match_modality/metrics/check_format/script.R
new file mode 100644
index 0000000000..ade396c0bb
--- /dev/null
+++ b/src/match_modality/metrics/check_format/script.R
@@ -0,0 +1,67 @@
+cat("Load dependencies\n")
+library(assertthat, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+task <- "match_modality"
+par <- list(
+  input_solution = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad"),
+  input_prediction = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad"),
+  output = paste0("resources_test/", task, "/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad")
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+# fall back to the solution's metadata in case the prediction cannot be read
+dataset_id <- ad_sol$uns[["dataset_id"]]
+method_id <- NA_character_
+
+cat("Checking prediction h5ad\n")
+correct_format <- tryCatch({
+  # read prediction
+  ad_pred <- anndata::read_h5ad(par$input_prediction)
+
+  # check dataset id
+  dataset_id <- ad_pred$uns[["dataset_id"]]
+  assert_that(dataset_id == ad_sol$uns[["dataset_id"]])
+
+  # check method id
+  method_id <- ad_pred$uns[["method_id"]]
+  assert_that(
+    is.character(method_id),
+    method_id != ""
+  )
+
+  # check X
+  assert_that(
+    is(ad_pred$X, "sparseMatrix"),
+    ad_pred$n_obs == ad_sol$n_obs,
+    ad_pred$n_vars == ad_sol$n_vars,
+    length(ad_pred$X@x) <= 1000 * ad_sol$n_obs,
+    all(ad_pred$X@x >= 0),
+    isTRUE(all.equal(
+      Matrix::rowSums(ad_pred$X),
+      rep(1, ad_pred$n_obs),
+      check.attributes = FALSE,
+      tolerance = 1e-5
+    ))
+  )
+
+  1
+}, error = function(e) {
+  cat("ERROR: ", e$message, "\n", sep = "")
+  0
+})
+
+cat("Create output object\n")
+out <- anndata::AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = dataset_id,
+    method_id = method_id,
+    metric_ids = c("finished", "correct_format"),
+    metric_values = c(1, correct_format)
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
diff --git a/src/match_modality/metrics/match_probability/config.vsh.yaml b/src/match_modality/metrics/match_probability/config.vsh.yaml
new file mode 100644
index 0000000000..c9c274f954
--- /dev/null
+++ b/src/match_modality/metrics/match_probability/config.vsh.yaml
@@ -0,0 +1,30 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: match_probability
+  namespace: match_modality/metrics
+  description: Compute the match probability metric for the match modality task.
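+  # Note (illustrative): after unscrambling the prediction's columns by the
+  # solution's pairing_ix, the true match of cell i lies on the diagonal, and
+  # script.R scores mean_i( X[i, i] / sum_j X[i, j] ). For example, with
+  # X = [[0.8, 0.2], [0.4, 0.6]] the score is mean(0.8, 0.6) = 0.7.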
+  info:
+    metrics:
+      - metric_id: match_probability
+        metric_name: Match Probability
+        metric_description: The mean probability mass that a prediction assigns to the correct cell pairing
+        min: 0
+        max: 1
+        maximize: true
+  resources:
+    - type: r_script
+      path: script.R
+platforms:
+  - type: docker
+    image: eddelbuettel/r2u:22.04
+    setup:
+      - type: r
+        cran: [ anndata, tidyverse, testthat ]
+      - type: apt
+        packages: [ libhdf5-dev, libgeos-dev, python3, python3-pip, python3-dev, python-is-python3 ]
+      - type: python
+        pip: [ anndata>=0.8, pyyaml ]
+  - type: nextflow
+    directives:
+      label: [ lowmem, lowcpu ]
+
diff --git a/src/match_modality/metrics/match_probability/script.R b/src/match_modality/metrics/match_probability/script.R
new file mode 100644
index 0000000000..b68051ad2b
--- /dev/null
+++ b/src/match_modality/metrics/match_probability/script.R
@@ -0,0 +1,56 @@
+cat("Load dependencies\n")
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+library(testthat, quietly = TRUE, warn.conflicts = FALSE)
+library(Matrix, quietly = TRUE, warn.conflicts = FALSE)
+requireNamespace("anndata", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  input_solution = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_sol.h5ad",
+  input_prediction = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.prediction.h5ad",
+  output = "resources_test/match_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.scores.h5ad"
+)
+## VIASH END
+
+cat("Read solution h5ad\n")
+ad_sol <- anndata::read_h5ad(par$input_solution, backed = "r")
+
+cat("Read prediction h5ad\n")
+ad_pred <- anndata::read_h5ad(par$input_prediction)
+
+cat("Unscrambling predictions\n")
+pairing_ix <- ad_sol$uns[["pairing_ix"]]
+X_pred <- as(ad_pred$X, "CsparseMatrix")[, order(pairing_ix)]
+dimnames(X_pred) <- list(NULL, NULL)
+
+# set negative values to 0
+X_pred@x <- ifelse(X_pred@x < 0, 0, X_pred@x)
+
+cat("Calculating normalisation factors\n")
+row_sums <- Matrix::rowSums(X_pred)
+
+cat("Computing the match probability score\n")
+# probability mass assigned to the true match of each cell
+match_probability_vec <- diag(X_pred) / row_sums
+
+match_probability <- mean(match_probability_vec)
+
+cat("Create output object\n")
+out <- anndata::AnnData(
+  shape = c(0, 0),
+  uns = list(
+    dataset_id = ad_pred$uns$dataset_id,
+    method_id = ad_pred$uns$method_id,
+    metric_ids = list("match_probability"),
+    metric_values = list(match_probability),
+    per_cell = list(
+      match_probability = match_probability_vec
+    )
+  )
+)
+
+cat("Write output to h5ad file\n")
+zzz <- out$write_h5ad(par$output, compression = "gzip")
\ No newline at end of file
diff --git a/src/match_modality/resources_scripts/mask_datasets.sh b/src/match_modality/resources_scripts/mask_datasets.sh
new file mode 100644
index 0000000000..a70cc985d5
--- /dev/null
+++ b/src/match_modality/resources_scripts/mask_datasets.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+COMMON_DATASETS="resources/datasets/openproblems_v1"
+OUTPUT_DIR="resources/match_modality/datasets/openproblems_v1"
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f $params_file ]; then
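+  # Illustrative sketch of the params.yaml generated below (hypothetical
+  # dataset id; the real entries depend on which h5ad files have a "counts"
+  # layer):
+  #
+  #   param_list:
+  #     - id: openproblems_v1.pancreas
+  #       input: resources/datasets/openproblems_v1/pancreas.h5ad
+  #       dataset_id: openproblems_v1.pancreas
+  #   seed: 123
+  #   output_train: $id.train.h5ad
+  #   output_test: $id.test.h5ad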
+  python << HERE
+import anndata as ad
+import glob
+import yaml
+
+h5ad_files = glob.glob("$COMMON_DATASETS/**.h5ad")
+
+# this task doesn't use normalizations
+param_list = {}
+
+for h5ad_file in h5ad_files:
+    print(f"Checking {h5ad_file}")
+    adata = ad.read_h5ad(h5ad_file, backed=True)
+    if "counts" in adata.layers:
+        dataset_id = adata.uns["dataset_id"].replace("/", ".")
+        obj = {
+            'id': dataset_id,
+            'input': h5ad_file,
+            'dataset_id': dataset_id,
+        }
+        param_list[dataset_id] = obj
+
+output = {
+    "param_list": list(param_list.values()),
+    "seed": 123,
+    "output_train": "\$id.train.h5ad",
+    "output_test": "\$id.test.h5ad"
+}
+
+with open("$params_file", "w") as file:
+    yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script target/nextflow/match_modality/mask_dataset/main.nf \
+  -profile docker \
+  -resume \
+  -params-file $params_file \
+  --publish_dir "$OUTPUT_DIR"
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/match_modality/resources_scripts/run_benchmarks.sh b/src/match_modality/resources_scripts/run_benchmarks.sh
new file mode 100644
index 0000000000..3bf2418934
--- /dev/null
+++ b/src/match_modality/resources_scripts/run_benchmarks.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+export TOWER_WORKSPACE_ID=53907369739130
+
+DATASETS_DIR="resources/match_modality/datasets/openproblems_v1"
+OUTPUT_DIR="resources/match_modality/benchmarks/openproblems_v1"
+
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+
+params_file="$OUTPUT_DIR/params.yaml"
+
+if [ ! -f $params_file ]; then
+  python << HERE
+import yaml
+import os
+
+dataset_dir = "$DATASETS_DIR"
+output_dir = "$OUTPUT_DIR"
+
+# read split datasets yaml
+with open(dataset_dir + "/params.yaml", "r") as file:
+    split_list = yaml.safe_load(file)
+datasets = split_list['param_list']
+
+# figure out where train/test files were stored
+param_list = []
+
+for dataset in datasets:
+    id = dataset["id"]
+    input_train = dataset_dir + "/" + id + ".train.h5ad"
+    input_test = dataset_dir + "/" + id + ".test.h5ad"
+
+    if os.path.exists(input_test):
+        obj = {
+            'id': id,
+            'dataset_id': dataset["dataset_id"],
+            'input_train': input_train,
+            'input_test': input_test
+        }
+        param_list.append(obj)
+
+# write as output file
+output = {
+    "param_list": param_list,
+}
+
+with open(output_dir + "/params.yaml", "w") as file:
+    yaml.dump(output, file)
+HERE
+fi
+
+export NXF_VER=22.04.5
+nextflow \
+  run . \
+  -main-script src/match_modality/workflows/run/main.nf \
+  -profile docker \
+  -params-file "$params_file" \
+  --publish_dir "$OUTPUT_DIR" \
+  -with-tower
+
+bin/tools/docker/nextflow/process_log/process_log \
+  --output "$OUTPUT_DIR/nextflow_log.tsv"
\ No newline at end of file
diff --git a/src/match_modality/resources_test_scripts/bmmc_cite.sh b/src/match_modality/resources_test_scripts/bmmc_cite.sh
new file mode 100644
index 0000000000..d3debaa836
--- /dev/null
+++ b/src/match_modality/resources_test_scripts/bmmc_cite.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# make sure the following command has been executed:
+# viash ns build -q 'match_modality|common' --parallel --setup cb
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+MOD_1_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_rna.h5ad
+MOD_2_DATA=resources_test/common/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.output_mod2.h5ad
+DATASET_DIR=resources_test/match_modality/bmmc_cite
+
+if [ ! -f $MOD_1_DATA ]; then
+  echo "Error! Could not find raw data"
+  exit 1
+fi
+
+mkdir -p $DATASET_DIR
+
+# mask dataset
+viash run src/match_modality/mask_dataset/config.vsh.yaml -- \
+  --input_mod1 $MOD_1_DATA \
+  --input_mod2 $MOD_2_DATA \
+  --output_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --output_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --output_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --output_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --output_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --output_test_sol $DATASET_DIR/cite_test_sol.h5ad
+
+# run one method
+viash run src/match_modality/methods/dr_knnr_cbf/config.vsh.yaml -- \
+  --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --output $DATASET_DIR/dr_knnr_cbf.h5ad
+
+# run one metric
+viash run src/match_modality/metrics/aupr/config.vsh.yaml -- \
+  --input_prediction $DATASET_DIR/dr_knnr_cbf.h5ad \
+  --input_solution $DATASET_DIR/cite_test_sol.h5ad \
+  --output $DATASET_DIR/aupr.h5ad
+
+# run benchmark
+export NXF_VER=22.04.5
+
+nextflow \
+  run . \
+  -main-script src/match_modality/workflows/run/main.nf \
+  -profile docker \
+  --id bmmc_cite \
+  --dataset_id bmmc_cite \
+  --input_train_mod1 $DATASET_DIR/cite_train_mod1.h5ad \
+  --input_train_mod2 $DATASET_DIR/cite_train_mod2.h5ad \
+  --input_train_sol $DATASET_DIR/cite_train_sol.h5ad \
+  --input_test_mod1 $DATASET_DIR/cite_test_mod1.h5ad \
+  --input_test_mod2 $DATASET_DIR/cite_test_mod2.h5ad \
+  --input_solution $DATASET_DIR/cite_test_sol.h5ad \
+  --output scores.tsv \
+  --publish_dir $DATASET_DIR/
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/config.vsh.yaml b/src/match_modality/workflows/run/config.vsh.yaml
new file mode 100644
index 0000000000..58db4d0b1e
--- /dev/null
+++ b/src/match_modality/workflows/run/config.vsh.yaml
@@ -0,0 +1,32 @@
+functionality:
+  name: "run_benchmark"
+  namespace: "match_modality/workflows"
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: "--id"
+          type: "string"
+          description: "The ID of the dataset"
+          required: true
+        - name: "--input_train_mod1"
+          type: "file" # todo: replace with includes
+        - name: "--input_train_mod2"
+          type: "file"
+        - name: "--input_train_sol"
+          type: "file" # todo: replace with includes
+        - name: "--input_test_mod1"
+          type: "file" # todo: replace with includes
+        - name: "--input_test_mod2"
+          type: "file"
+        - name: "--input_solution"
+          type: "file" # todo: replace with includes
+    - name: Outputs
+      arguments:
+        - name: "--output"
+          direction: "output"
+          type: file
+  resources:
+    - type: nextflow_script
+      path: main.nf
+platforms:
+  - type: nextflow
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/main.nf b/src/match_modality/workflows/run/main.nf
new file mode 100644
index 0000000000..fe48e3f63f
--- /dev/null
+++ b/src/match_modality/workflows/run/main.nf
@@ -0,0 +1,168 @@
+nextflow.enable.dsl=2
+
+sourceDir = params.rootDir + "/src"
+targetDir = params.rootDir + "/target/nextflow"
+
+// import control methods
+include { constant } from "$targetDir/match_modality/control_methods/constant/main.nf"
+include { random_pairing } from "$targetDir/match_modality/control_methods/random_pairing/main.nf"
+include { semi_solution } from "$targetDir/match_modality/control_methods/semi_solution/main.nf"
+include { solution } from "$targetDir/match_modality/control_methods/solution/main.nf"
+
+// import methods
+// include { babel_knn } from "$targetDir/match_modality/methods/babel_knn/main.nf"
+include { dr_knnr_cbf } from "$targetDir/match_modality/methods/dr_knnr_cbf/main.nf"
+include { dr_knnr_knn } from "$targetDir/match_modality/methods/dr_knnr_knn/main.nf"
+include { linear_knn } from "$targetDir/match_modality/methods/linear_knn/main.nf"
+include { newwave_knnr_cbf } from "$targetDir/match_modality/methods/newwave_knnr_cbf/main.nf"
+include { newwave_knnr_knn } from "$targetDir/match_modality/methods/newwave_knnr_knn/main.nf"
+include { procrustes_knn } from "$targetDir/match_modality/methods/procrustes_knn/main.nf"
+
+// import metrics
+include { aupr } from "$targetDir/match_modality/metrics/aupr/main.nf"
+include { check_format } from "$targetDir/match_modality/metrics/check_format/main.nf"
+include { match_probability } from "$targetDir/match_modality/metrics/match_probability/main.nf"
+
+// tsv generation component
+include { extract_scores } from "$targetDir/common/extract_scores/main.nf"
+
+// import helper functions
+include { readConfig; viashChannel; helpMessage } from sourceDir + "/wf_utils/WorkflowHelper.nf"
+include { setWorkflowArguments; getWorkflowArguments; passthroughMap as pmap } from sourceDir + "/wf_utils/DataflowHelper.nf"
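+
+// Note (illustrative): events flowing through this workflow are tuples of
+// [ id, data, passthrough ]; setWorkflowArguments splits the params into the
+// "method", "metric" and "output" groups carried along in `passthrough`, and
+// getWorkflowArguments(key: ...) pulls one group back out for the next step.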
+
+config = readConfig("$projectDir/config.vsh.yaml")
+
+// construct a map of methods (id -> method_module)
+methods = [ dr_knnr_cbf, dr_knnr_knn, linear_knn, newwave_knnr_cbf, newwave_knnr_knn, procrustes_knn ]
+  .collectEntries{method ->
+    [method.config.functionality.name, method]
+  }
+
+workflow {
+  helpMessage(config)
+
+  viashChannel(params, config)
+    | run_wf
+}
+
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // split params for downstream components
+    | setWorkflowArguments(
+      method: ["input_train_mod1", "input_train_mod2", "input_train_sol", "input_test_mod1", "input_test_mod2"],
+      metric: ["input_solution"],
+      output: ["output"]
+    )
+
+    // multiply events by the number of methods
+    | add_methods
+
+    // add input_solution to data for the positive controls
+    | controls_can_cheat
+
+    // run methods
+    | getWorkflowArguments(key: "method")
+    | run_methods
+
+    // construct tuples for metrics
+    | pmap{ id, file, passthrough ->
+      // derive unique ids from output filenames
+      def newId = file.getName().replaceAll(".output.*", "")
+      // combine prediction with solution
+      def newData = [ input_prediction: file, input_solution: passthrough.metric.input_solution ]
+      [ newId, newData, passthrough ]
+    }
+
+    // run metrics
+    | getWorkflowArguments(key: "metric")
+    | run_metrics
+
+    // convert to tsv
+    | aggregate_results
+
+  emit:
+  output_ch
+}
+
+workflow add_methods {
+  take: input_ch
+  main:
+    output_ch = Channel.fromList(methods.keySet())
+      | combine(input_ch)
+
+      // generate combined id for method_id and dataset_id
+      | pmap{method_id, dataset_id, data ->
+        def new_id = dataset_id + "." + method_id
+        def new_data = data.clone() + [method_id: method_id]
+        new_data.remove("id")
+        [new_id, new_data]
+      }
+  emit: output_ch
+}
+
+workflow controls_can_cheat {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | pmap{id, data, passthrough ->
+        def method = methods[data.method_id]
+        def method_type = method.config.functionality.info.method_type
+        def new_data = data.clone()
+        if (method_type != "method") {
+          new_data = new_data + [input_test_sol: passthrough.metric.input_solution]
+        }
+        [id, new_data, passthrough]
+      }
+  emit: output_ch
+}
+
+workflow run_methods {
+  take: input_ch
+  main:
+    // generate one channel per method
+    method_chs = methods.collect { method_id, method_module ->
+      input_ch
+        | filter{it[1].method_id == method_id}
+        | method_module
+    }
+    // mix all results
+    output_ch = method_chs[0].mix(*method_chs.drop(1))
+
+  emit: output_ch
+}
+
+workflow run_metrics {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | (aupr & check_format & match_probability)
+      | mix
+
+  emit: output_ch
+}
+
+workflow aggregate_results {
+  take: input_ch
+  main:
+    output_ch = input_ch
+      | toSortedList
+      | filter{ it.size() > 0 }
+      | map{ it ->
+        [ "combined", it.collect{ it[1] } ] + it[0].drop(2)
+      }
+      | getWorkflowArguments(key: "output")
+      | extract_scores.run(
+        auto: [ publish: true ]
+      )
+
+  emit: output_ch
}
\ No newline at end of file
diff --git a/src/match_modality/workflows/run/nextflow.config b/src/match_modality/workflows/run/nextflow.config
new file mode 100644
index 0000000000..6e33495d3a
--- /dev/null
+++ b/src/match_modality/workflows/run/nextflow.config
@@ -0,0 +1,14 @@
+manifest {
+  name = 'match_modality/workflows/run'
+  mainScript = 'main.nf'
+  nextflowVersion = '!>=22.04.5'
+  description = 'Multi modality - match modality'
+}
+
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+}
+
+// include common settings
+includeConfig("${params.rootDir}/src/wf_utils/ProfilesHelper.config")
+includeConfig("${params.rootDir}/src/wf_utils/labels.config")
\ No newline at end of file