From 6351c4ddc5083db5d7baa6a1e39f20d7be091189 Mon Sep 17 00:00:00 2001 From: leawlb Date: Tue, 4 Oct 2022 16:45:29 +0200 Subject: [PATCH 01/32] most minimal changes that still work in dry run --- config/config-test.yaml | 8 ++++---- workflow/Snakefile | 7 +++++-- workflow/scripts/samples.py | 10 ---------- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index a4904a1..df1436d 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -1,5 +1,5 @@ paths: - output_dir: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/results" + output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" target_templates: linked_files: "linked_files/{0[individual]}/{0[sample_type]}" cellranger_count: "cellranger_count/{0[individual]}_{0[sample_type]}/outs" @@ -11,7 +11,7 @@ references: C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" metadata: raw: - - "~/projects/bonemarrow-study/data/otp-export/2022-09-21/OE0538_DO-0008_mmus/data-files.csv" + - "/omics/groups/OE0538/internal/users/l012t/Interspecies_BM_phd/data/data-files.csv" identifiers: - SAMPLE_NAME - Sample_Type @@ -26,7 +26,7 @@ metadata: # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: - extra: "" # set additional arguments for cellranger count +# extra: "" # set additional arguments for cellranger count allele_specific: False wasp_filter_reads: False - realign_bam: False + realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 184622b..8e65206 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -4,6 +4,8 @@ import os from pathlib import Path from scripts.samples import Samples +print("works") + samples = Samples(config) REFERENCES = config["references"] @@ -15,6 +17,7 @@ def get_sample_string(identifiers): SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) print(samples.metadata) +print(SAMPLE_WILDCARDS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules @@ -79,7 +82,7 @@ rule cellranger_count: params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path - extra=config["rules"]["cellranger_count"]["extra"] + #extra=config["rules"]["cellranger_count"]["extra"] log: "logs/cellranger_count_{individual}_{sample_type}.log" threads: 16 @@ -92,6 +95,6 @@ rule cellranger_count: "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " - "{extra} " + #"{extra} " "--sample {wildcards.individual}_{wildcards.sample_type} " "> {log}" diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index bf9c40a..56d557c 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -7,8 +7,6 @@ class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame that provides metadata for the workflow. - - """ # columns to select columns = ["PID", @@ -64,11 +62,8 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: """Add column containing CellRanger compatible filename, i.e. in the format of - [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz - Here, [Sample Name] consists of PID and Sample Type. - See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input """ @@ -104,7 +99,6 @@ def filter_by_wildcards( ) -> list: """Get item from DataFrame by subsetting using the following index attributes provided via Snakemakes wildcards object. - :return param: """ @@ -138,7 +132,3 @@ def targets(self) -> list: targets = list(map(lambda x: self.metadata.agg(x.format, axis=1).drop_duplicates().to_list(), target_templates.values())) return targets - - - -#grouped = metadata_subset.groupby(["PID", "Sample Type"])["FastQ Path"].apply(list) From 18d078dbefb749644dc8a65aa9610dfafd21a5b6 Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 5 Oct 2022 10:50:02 +0200 Subject: [PATCH 02/32] customized for one object_ID as identifier --- config/config-test.yaml | 13 ++++++------- workflow/Snakefile | 27 ++++++++++++++------------- workflow/scripts/samples.py | 14 +++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index df1436d..54e6b0c 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -1,8 +1,8 @@ paths: output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" target_templates: - linked_files: "linked_files/{0[individual]}/{0[sample_type]}" - cellranger_count: "cellranger_count/{0[individual]}_{0[sample_type]}/outs" + linked_files: "linked_files/{0[individual]}" + cellranger_count: "cellranger_count/{0[individual]}/outs" samples_sheet: "metadata/samples.csv" references: all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_all_strains" @@ -11,12 +11,11 @@ references: C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" metadata: raw: - - "/omics/groups/OE0538/internal/users/l012t/Interspecies_BM_phd/data/data-files.csv" + - "~/Interspecies_BM_phd/data/metadata_full.csv" identifiers: - - SAMPLE_NAME - - Sample_Type + - Object_ID - Age - - fraction + - Fraction # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects @@ -26,7 +25,7 @@ metadata: # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: -# extra: "" # set additional arguments for cellranger count + extra: "" # set additional arguments for cellranger count allele_specific: False wasp_filter_reads: False realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 8e65206..b76e796 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -4,8 +4,6 @@ import os from pathlib import Path from scripts.samples import Samples -print("works") - samples = Samples(config) REFERENCES = config["references"] @@ -16,20 +14,21 @@ def get_sample_string(identifiers): SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) -print(samples.metadata) -print(SAMPLE_WILDCARDS) +#rint(samples.metadata) +#print(SAMPLE_WILDCARDS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules wildcard_constraints: - sample_type="[A-z0-9_-]+", - individual="[A-z0-9_-]+" + individual="[A-z0-9_.-]+" + +print(samples.targets) + rule all: input: samples.targets - rule link_files: """Link files with filename pattern required for Cellranger @@ -44,9 +43,9 @@ rule link_files: ) output: - directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/{{sample_type}}/") + directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/") log: - "logs/link_files_{individual}_{sample_type}.log" + "logs/link_files_{individual}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -69,6 +68,8 @@ rule write_samples_spreadsheet: samples.metadata.to_csv(output[0], index=False) print("done.", file=log_fh) +print("works") + rule cellranger_count: """Run Cellranger count @@ -78,23 +79,23 @@ rule cellranger_count: input: lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output] output: - outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}_{{sample_type}}/outs" + outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs" params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - "logs/cellranger_count_{individual}_{sample_type}.log" + "logs/cellranger_count_{individual}.log" threads: 16 envmodules: "cellranger/6.1.1", shell: "cd {params.output_root_dir}; " "cellranger count " - "--id {wildcards.individual}_{wildcards.sample_type} " # letters, numbers, hyphens, and underscores + "--id {wildcards.individual} " # letters, numbers, hyphens, and underscores "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " #"{extra} " - "--sample {wildcards.individual}_{wildcards.sample_type} " + "--sample {wildcards.individual} " "> {log}" diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 56d557c..76e8f1a 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -23,12 +23,12 @@ class Samples: "DATE_OF_DEATH", "LANE_NO", "READ", - "CELLRANGER_FASTQ_PATH"] + "CELLRANGER_FASTQ_PATH", + "Object_ID"] # map to rename columns in the format of (old):(new) columns_map = { - "PID": "individual", - "Sample Type": "sample_type", + "Object_ID": "individual" } def __init__(self, config): @@ -63,17 +63,17 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: """Add column containing CellRanger compatible filename, i.e. in the format of [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz - Here, [Sample Name] consists of PID and Sample Type. + Here, [Sample Name] consists of Object_ID. See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input """ - grouped = df.groupby(["PID", "Sample Type", "LANE_NO", "READ"]) + grouped = df.groupby(["Object_ID", "LANE_NO", "READ"]) groups = [] for name, group in grouped: group = group.sort_values("FastQ Path") # import to have consistent sorting group["multi_sample_idx"] = range(1, len(group)+1) group["CELLRANGER_FASTQ_PATH"] = group.agg( - "{0[PID]}_{0[Sample Type]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, + "{0[Object_ID]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, axis=1, ) groups.append(group) @@ -102,7 +102,7 @@ def filter_by_wildcards( :return param: """ - _identifiers = ["individual", "sample_type"] + _identifiers = ["individual"] if wildcards: filters = dict((k, getattr(wildcards, k)) for k in _identifiers) From ded0ee90bbec4251647b9b34c7528aca0db76414 Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 5 Oct 2022 11:12:14 +0200 Subject: [PATCH 03/32] WIP small addition for customization later --- config/config-test.yaml | 7 ++++--- workflow/Snakefile | 7 ++----- workflow/scripts/samples.py | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index 54e6b0c..328ff60 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -13,9 +13,10 @@ metadata: raw: - "~/Interspecies_BM_phd/data/metadata_full.csv" identifiers: - - Object_ID - - Age - - Fraction + - Species_ID + - Age_ID + - Fraction_ID + - Sample_NR # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects diff --git a/workflow/Snakefile b/workflow/Snakefile index b76e796..dde2ce0 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,17 +14,14 @@ def get_sample_string(identifiers): SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) -#rint(samples.metadata) -#print(SAMPLE_WILDCARDS) +print(samples.metadata) +print(SAMPLE_WILDCARDS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules wildcard_constraints: individual="[A-z0-9_.-]+" -print(samples.targets) - - rule all: input: samples.targets diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 76e8f1a..ee8e5b6 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,7 +2,6 @@ from snakemake.io import Wildcards - class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame @@ -25,7 +24,7 @@ class Samples: "READ", "CELLRANGER_FASTQ_PATH", "Object_ID"] - + # map to rename columns in the format of (old):(new) columns_map = { "Object_ID": "individual" @@ -33,6 +32,7 @@ class Samples: def __init__(self, config): metadata_files = config["metadata"]["raw"] + identifiers = config["metadata"]["identifiers"] # for customization later self.output_base_dir = config["paths"]["output_dir"] self.target_templates = config["paths"]["target_templates"] From 028af443578345b1cdbba562355d91ef410f5a38 Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 5 Oct 2022 16:06:46 +0200 Subject: [PATCH 04/32] WIP reying to run rule cellranger_count --- config/config-test.yaml | 2 +- workflow/Snakefile | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index 328ff60..67d667f 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -5,7 +5,7 @@ paths: cellranger_count: "cellranger_count/{0[individual]}/outs" samples_sheet: "metadata/samples.csv" references: - all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_all_strains" + all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" diff --git a/workflow/Snakefile b/workflow/Snakefile index dde2ce0..29b5516 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,13 +14,13 @@ def get_sample_string(identifiers): SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) -print(samples.metadata) -print(SAMPLE_WILDCARDS) +#print(samples.metadata) +#print(SAMPLE_WILDCARDS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules -wildcard_constraints: - individual="[A-z0-9_.-]+" +wildcard_constraints: #individuals should not contain . or cellranger will not accept it as ID + individual="[A-z0-9_-]+" rule all: input: @@ -65,8 +65,6 @@ rule write_samples_spreadsheet: samples.metadata.to_csv(output[0], index=False) print("done.", file=log_fh) -print("works") - rule cellranger_count: """Run Cellranger count @@ -76,10 +74,10 @@ rule cellranger_count: input: lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output] output: - outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs" + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs") params: genome=REFERENCES["all_masked"], - output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path + output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: "logs/cellranger_count_{individual}.log" @@ -88,6 +86,7 @@ rule cellranger_count: "cellranger/6.1.1", shell: "cd {params.output_root_dir}; " + "pwd;" "cellranger count " "--id {wildcards.individual} " # letters, numbers, hyphens, and underscores "--transcriptome {params.genome} " From 17e1d0d83e43c04def396a73115c7af3cde3a490 Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 6 Oct 2022 13:27:58 +0200 Subject: [PATCH 05/32] path changes to try successful run --- config/config-test.yaml | 10 +++++----- workflow/Snakefile | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index 67d667f..e42cb6e 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -6,12 +6,12 @@ paths: samples_sheet: "metadata/samples.csv" references: all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" - CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" - SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" - C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" + #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" + #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" + #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" metadata: raw: - - "~/Interspecies_BM_phd/data/metadata_full.csv" + - "~/snakemake-cellranger/data/metadata_full.csv" identifiers: - Species_ID - Age_ID @@ -26,7 +26,7 @@ metadata: # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: - extra: "" # set additional arguments for cellranger count + # extra: "" # set additional arguments for cellranger count allele_specific: False wasp_filter_reads: False realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 29b5516..04f5ebe 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -58,7 +58,7 @@ rule write_samples_spreadsheet: output: f"{OUTPUT_BASE_PATH}/{config['paths']['target_templates']['samples_sheet']}" log: - "logs/write_samples_spreadsheet.log" + f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log" run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -80,7 +80,7 @@ rule cellranger_count: output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - "logs/cellranger_count_{individual}.log" + f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{individual}.log" threads: 16 envmodules: "cellranger/6.1.1", From 31cc8e7d291a9cde7cdd5deea3e11959e389b8af Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 6 Oct 2022 16:19:44 +0200 Subject: [PATCH 06/32] minor change for successful run with test dataset --- workflow/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 04f5ebe..0fb4bbf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -42,7 +42,7 @@ rule link_files: output: directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/") log: - "logs/link_files_{individual}.log" + f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -80,7 +80,7 @@ rule cellranger_count: output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{individual}.log" + f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log" threads: 16 envmodules: "cellranger/6.1.1", From 159870053f3265bd7a5f169643af70aa0446e56a Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 6 Oct 2022 16:48:27 +0200 Subject: [PATCH 07/32] added R script for constructing SCE objects --- config/config-test.yaml | 1 + workflow/Snakefile | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/config/config-test.yaml b/config/config-test.yaml index e42cb6e..dafdf53 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -4,6 +4,7 @@ paths: linked_files: "linked_files/{0[individual]}" cellranger_count: "cellranger_count/{0[individual]}/outs" samples_sheet: "metadata/samples.csv" + construct_sce_objects: "sce_objects/{0[individual]}" references: all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" diff --git a/workflow/Snakefile b/workflow/Snakefile index 0fb4bbf..7e865ec 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -95,3 +95,13 @@ rule cellranger_count: #"{extra} " "--sample {wildcards.individual} " "> {log}" + +# construct SCE objects from counts matrices found in ../alignment/output/ +rule construct_sce_objects: + input: + output_cellranger = + f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix" + output: + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" + script: + "scripts/construct_sce_objects.R" From cf7af69b8005ff70ec4d3d86f9902efe30a49e4e Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 6 Oct 2022 16:49:26 +0200 Subject: [PATCH 08/32] added R script for SCE object construction --- workflow/scripts/construct_sce_objects.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 workflow/scripts/construct_sce_objects.R diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R new file mode 100644 index 0000000..9684ca3 --- /dev/null +++ b/workflow/scripts/construct_sce_objects.R @@ -0,0 +1,24 @@ +#------------------------------------------------------------------------------- +# call objects from raw cellranger output matrices and add metadata + +library(DropletUtils, quietly = TRUE) +# r and DropletUtils modules must be installed in snakemake conda environment +# https://anaconda.org/conda-forge/r-base +# https://anaconda.org/bioconda/bioconductor-dropletutils + +#identifiers <- snakemake@params[["identifiers"]] +#single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]] + + +#print(metadata) +#print(identifiers) +#print(single_cell_object_metadata_fields) + +print(snakemake@input[["output_cellranger"]]) + +# construct from raw cellranger output +sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, + type = "sparse" ) + + +saveRDS(sce, file = snakemake@output[["sce_objects"]]) From 2fa037f51d20703d7559aeb0120164fdc3a26f35 Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 6 Oct 2022 18:53:38 +0200 Subject: [PATCH 09/32] WIP trying to change 'individual' to identifier wildcards --- config/config-test.yaml | 3 +- workflow/Snakefile | 35 ++++++++++++++---------- workflow/scripts/construct_sce_objects.R | 35 +++++++++++++++++++----- workflow/scripts/samples.py | 3 ++ 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index dafdf53..2888a05 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -14,10 +14,11 @@ metadata: raw: - "~/snakemake-cellranger/data/metadata_full.csv" identifiers: - - Species_ID + #- Species_ID # not needed atm since I already have only one species - Age_ID - Fraction_ID - Sample_NR + # - Object_ID # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects diff --git a/workflow/Snakefile b/workflow/Snakefile index 7e865ec..63c69f2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,21 +6,24 @@ from scripts.samples import Samples samples = Samples(config) -REFERENCES = config["references"] +METADATA_PATH = config["metadata"]["raw"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] +REFERENCES = config["references"] +IDENTIFIERS = config["metadata"]["identifiers"] + def get_sample_string(identifiers): return "_".join(["{" + label + "}" for label in identifiers]) - -SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) - -#print(samples.metadata) -#print(SAMPLE_WILDCARDS) +SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules -wildcard_constraints: #individuals should not contain . or cellranger will not accept it as ID - individual="[A-z0-9_-]+" +wildcard_constraints: #individuals should not contain "." + #individual="[A-z0-9_-]+", + #Species_ID="[A-z0-9_-]+", + Age_ID=IDENTIFIERS[0], + Fraction_ID="[A-z0-9_-]+", + Sample_NR="[A-z0-9-]+" rule all: input: @@ -40,9 +43,8 @@ rule link_files: ) output: - directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/") log: - f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" + f"{OUTPUT_BASE_PATH}/logs/link_files_{SAMPLE_WILDCARDS}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -74,13 +76,13 @@ rule cellranger_count: input: lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output] output: - outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs") + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs") params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log" + f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{SAMPLE_WILDCARDS}.log" threads: 16 envmodules: "cellranger/6.1.1", @@ -95,13 +97,16 @@ rule cellranger_count: #"{extra} " "--sample {wildcards.individual} " "> {log}" - + # construct SCE objects from counts matrices found in ../alignment/output/ rule construct_sce_objects: input: output_cellranger = - f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix" + f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs/raw_feature_bc_matrix" output: - sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{SAMPLE_WILDCARDS}" + params: + identifiers = IDENTIFIERS, + metadata = METADATA_PATH script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 9684ca3..0e36696 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -9,16 +9,37 @@ library(DropletUtils, quietly = TRUE) #identifiers <- snakemake@params[["identifiers"]] #single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]] - -#print(metadata) -#print(identifiers) -#print(single_cell_object_metadata_fields) - -print(snakemake@input[["output_cellranger"]]) - # construct from raw cellranger output sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, type = "sparse" ) +# add specified metadata to object +metadata <- read.csv(file = snakemake@params[["metadata"]], + head = TRUE, + sep = ",", + check.names=FALSE, + stringsAsFactors=FALSE, + as.is=TRUE, + colClasses = "character") + +identifiers <- snakemake@params[["identifiers"]] +print(identifiers) + +#wildcards_curr <- snakemake@wildcards[[identifiers[1]]] +#print(wildcards_curr) + +# construct a unique Object_ID from the identifiers and add to metadata if necessary +for(i in snakemake@wildcards){ + print(i) +} +print(class(c(snakemake@wildcards))) + +#object_id_curr <- paste(identifiers, collapse = "_") +#print(object_id_curr) + + + saveRDS(sce, file = snakemake@output[["sce_objects"]]) + + diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index ee8e5b6..87d628c 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -1,5 +1,7 @@ import pandas as pd +#config['metadata']['identifiers'] + from snakemake.io import Wildcards class Samples: @@ -29,6 +31,7 @@ class Samples: columns_map = { "Object_ID": "individual" } + def __init__(self, config): metadata_files = config["metadata"]["raw"] From 224a296687342336f0ecac50e3c99ca5c935aa71 Mon Sep 17 00:00:00 2001 From: leawlb Date: Fri, 7 Oct 2022 10:04:52 +0200 Subject: [PATCH 10/32] WIP going back to individual --- config/config-test.yaml | 8 ++++---- workflow/Snakefile | 17 +++++++++-------- workflow/scripts/construct_sce_objects.R | 19 ++++++++++--------- workflow/scripts/samples.py | 5 +++-- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/config/config-test.yaml b/config/config-test.yaml index 2888a05..d5dfb1f 100644 --- a/config/config-test.yaml +++ b/config/config-test.yaml @@ -15,10 +15,10 @@ metadata: - "~/snakemake-cellranger/data/metadata_full.csv" identifiers: #- Species_ID # not needed atm since I already have only one species - - Age_ID - - Fraction_ID - - Sample_NR - # - Object_ID + #- Age_ID + #- Fraction_ID + #- Sample_NR + - Object_ID # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects diff --git a/workflow/Snakefile b/workflow/Snakefile index 63c69f2..38dcc49 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,9 +21,10 @@ localrules: link_files, write_samples_spreadsheet, all # local execution of non wildcard_constraints: #individuals should not contain "." #individual="[A-z0-9_-]+", #Species_ID="[A-z0-9_-]+", - Age_ID=IDENTIFIERS[0], - Fraction_ID="[A-z0-9_-]+", - Sample_NR="[A-z0-9-]+" + #Age_ID="[A-z0-9_-]+", + #Fraction_ID="[A-z0-9_-]+", + #Sample_NR="[A-z0-9-]+" + individual="[A-z0-9-]+" rule all: input: @@ -44,7 +45,7 @@ rule link_files: ) output: log: - f"{OUTPUT_BASE_PATH}/logs/link_files_{SAMPLE_WILDCARDS}.log" + f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -76,13 +77,13 @@ rule cellranger_count: input: lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output] output: - outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs") + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs") params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{SAMPLE_WILDCARDS}.log" + f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log" threads: 16 envmodules: "cellranger/6.1.1", @@ -102,9 +103,9 @@ rule cellranger_count: rule construct_sce_objects: input: output_cellranger = - f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs/raw_feature_bc_matrix" + f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix" output: - sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{SAMPLE_WILDCARDS}" + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" params: identifiers = IDENTIFIERS, metadata = METADATA_PATH diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 0e36696..3a94b89 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -25,17 +25,11 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], identifiers <- snakemake@params[["identifiers"]] print(identifiers) -#wildcards_curr <- snakemake@wildcards[[identifiers[1]]] -#print(wildcards_curr) +wildcards_curr <- snakemake@wildcards[["individual"]] +print(wildcards_curr) + -# construct a unique Object_ID from the identifiers and add to metadata if necessary -for(i in snakemake@wildcards){ - print(i) -} -print(class(c(snakemake@wildcards))) -#object_id_curr <- paste(identifiers, collapse = "_") -#print(object_id_curr) @@ -43,3 +37,10 @@ print(class(c(snakemake@wildcards))) saveRDS(sce, file = snakemake@output[["sce_objects"]]) + + +metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE, sep = ",", check.names=FALSE, stringsAsFactors=FALSE, as.is=TRUE, colClasses = "character") +sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE, type = "sparse" ) + + + diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 87d628c..3889625 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -25,11 +25,11 @@ class Samples: "LANE_NO", "READ", "CELLRANGER_FASTQ_PATH", - "Object_ID"] + "Object_ID"] # map to rename columns in the format of (old):(new) columns_map = { - "Object_ID": "individual" + "Object_ID": "individual" # Object_ID should uniquely identify each sample/object } @@ -48,6 +48,7 @@ def __init__(self, config): self.metadata = self.select_columns(metadata_full) self.metadata = self.metadata.rename(self.columns_map, axis="columns") + def rename_date_of_birth(self, row: pd.Series): From 135c3c1b2388e73b29c4528bbfe46519712d0d02 Mon Sep 17 00:00:00 2001 From: leawlb Date: Fri, 7 Oct 2022 10:41:26 +0200 Subject: [PATCH 11/32] addition of metadata to SCE objects during construction --- config/config-test.yaml | 34 --------------------- workflow/Snakefile | 20 +++++-------- workflow/scripts/construct_sce_objects.R | 38 ++++++++++-------------- workflow/scripts/samples.py | 6 ++-- 4 files changed, 28 insertions(+), 70 deletions(-) delete mode 100644 config/config-test.yaml diff --git a/config/config-test.yaml b/config/config-test.yaml deleted file mode 100644 index d5dfb1f..0000000 --- a/config/config-test.yaml +++ /dev/null @@ -1,34 +0,0 @@ -paths: - output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" - target_templates: - linked_files: "linked_files/{0[individual]}" - cellranger_count: "cellranger_count/{0[individual]}/outs" - samples_sheet: "metadata/samples.csv" - construct_sce_objects: "sce_objects/{0[individual]}" -references: - all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" - #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" - #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" - #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" -metadata: - raw: - - "~/snakemake-cellranger/data/metadata_full.csv" - identifiers: - #- Species_ID # not needed atm since I already have only one species - #- Age_ID - #- Fraction_ID - #- Sample_NR - - Object_ID - # Define all columns from the metadata spreadsheet that - # will be included in the SingleCellExperiment / Seurat - # objects - single_cell_object_metadata_fields: - - SAMPLE_NAME - - Age -# Enable / Disable rules and specifiy rule-specific parameters -rules: - cellranger_count: - # extra: "" # set additional arguments for cellranger count - allele_specific: False - wasp_filter_reads: False - realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 38dcc49..8d8a8ff 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,20 +10,15 @@ METADATA_PATH = config["metadata"]["raw"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] REFERENCES = config["references"] -IDENTIFIERS = config["metadata"]["identifiers"] +IDENTIFIER = config["metadata"]["identifier"] -def get_sample_string(identifiers): - return "_".join(["{" + label + "}" for label in identifiers]) -SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS) +#def get_sample_string(identifiers): +# return "_".join(["{" + label + "}" for label in identifiers]) +#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules -wildcard_constraints: #individuals should not contain "." - #individual="[A-z0-9_-]+", - #Species_ID="[A-z0-9_-]+", - #Age_ID="[A-z0-9_-]+", - #Fraction_ID="[A-z0-9_-]+", - #Sample_NR="[A-z0-9-]+" +wildcard_constraints: #should not contain ".", see rule cellranger_count individual="[A-z0-9-]+" rule all: @@ -107,7 +102,8 @@ rule construct_sce_objects: output: sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" params: - identifiers = IDENTIFIERS, - metadata = METADATA_PATH + identifier = IDENTIFIER, + metadata = METADATA_PATH, + single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 3a94b89..86af012 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -6,11 +6,9 @@ library(DropletUtils, quietly = TRUE) # https://anaconda.org/conda-forge/r-base # https://anaconda.org/bioconda/bioconductor-dropletutils -#identifiers <- snakemake@params[["identifiers"]] -#single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]] - -# construct from raw cellranger output -sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, +# construct SCE object from raw cellranger output +sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], + col.names = TRUE, type = "sparse" ) # add specified metadata to object @@ -22,25 +20,21 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -identifiers <- snakemake@params[["identifiers"]] -print(identifiers) - -wildcards_curr <- snakemake@wildcards[["individual"]] -print(wildcards_curr) - - - - +wildcards_curr <- snakemake@wildcards[["individual"]] # to know which sample is currently loaded +identifier <- snakemake@params[["identifier"]] # to know which metadata column to compare it to to get the correct info +# subset data that is relevant for current object only, as specified by wildcard and by by single_cell_object_metadata_fields +metadata_curr <- metadata[which(metadata[which(colnames(metadata) == identifier)] == wildcards_curr),] +cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] +metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] +# add cell-level metadata to each barcode in the SCE object +for(i in colnames(metadata_curr)){ + colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce)) +} saveRDS(sce, file = snakemake@output[["sce_objects"]]) - - - -metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE, sep = ",", check.names=FALSE, stringsAsFactors=FALSE, as.is=TRUE, colClasses = "character") -sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE, type = "sparse" ) - - - +# for testing purposes +#metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE, sep = ",", check.names=FALSE, stringsAsFactors=FALSE, as.is=TRUE, colClasses = "character") +#sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE, type = "sparse" ) diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 3889625..4b7a419 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -28,14 +28,16 @@ class Samples: "Object_ID"] # map to rename columns in the format of (old):(new) + # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet + # it must be identical to config metadata identifier + # I haven't found a way to directly lift it from config yet columns_map = { - "Object_ID": "individual" # Object_ID should uniquely identify each sample/object + "Object_ID": "individual" } def __init__(self, config): metadata_files = config["metadata"]["raw"] - identifiers = config["metadata"]["identifiers"] # for customization later self.output_base_dir = config["paths"]["output_dir"] self.target_templates = config["paths"]["target_templates"] From e9691333807dad9b3cd15517f58947177b007f11 Mon Sep 17 00:00:00 2001 From: leawlb Date: Fri, 7 Oct 2022 13:53:23 +0200 Subject: [PATCH 12/32] minor fixes --- workflow/scripts/construct_sce_objects.R | 19 +++++++++---------- workflow/scripts/samples.py | 2 -- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 86af012..45795b8 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -1,5 +1,4 @@ #------------------------------------------------------------------------------- -# call objects from raw cellranger output matrices and add metadata library(DropletUtils, quietly = TRUE) # r and DropletUtils modules must be installed in snakemake conda environment @@ -20,11 +19,15 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -wildcards_curr <- snakemake@wildcards[["individual"]] # to know which sample is currently loaded -identifier <- snakemake@params[["identifier"]] # to know which metadata column to compare it to to get the correct info +# to know which sample is currently loaded +wildcard_curr <- snakemake@wildcards[["individual"]] +# to know which metadata column to compare it to +identifier <- snakemake@params[["identifier"]] -# subset data that is relevant for current object only, as specified by wildcard and by by single_cell_object_metadata_fields -metadata_curr <- metadata[which(metadata[which(colnames(metadata) == identifier)] == wildcards_curr),] +# subset data that is relevant for current object only +# as specified by wildcard and by by single_cell_object_metadata_fields +metadata_curr <- metadata[ + which(metadata[which(colnames(metadata) == identifier)] == wildcard_curr),] cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] @@ -33,8 +36,4 @@ for(i in colnames(metadata_curr)){ colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce)) } -saveRDS(sce, file = snakemake@output[["sce_objects"]]) - -# for testing purposes -#metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE, sep = ",", check.names=FALSE, stringsAsFactors=FALSE, as.is=TRUE, colClasses = "character") -#sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE, type = "sparse" ) +saveRDS(sce, file = snakemake@output[["sce_objects"]]) \ No newline at end of file diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 4b7a419..4073787 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -1,7 +1,5 @@ import pandas as pd -#config['metadata']['identifiers'] - from snakemake.io import Wildcards class Samples: From 9276f95bfff6393b13519a3ab4012f0e6377cfbc Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 10 Oct 2022 10:33:38 +0200 Subject: [PATCH 13/32] restored modified versions after merge --- config/config-test-lea.yaml | 48 ++++++++++++++++++++++++++++++++++ workflow/Snakefile | 51 +++++++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 16 deletions(-) create mode 100644 config/config-test-lea.yaml diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml new file mode 100644 index 0000000..abecb23 --- /dev/null +++ b/config/config-test-lea.yaml @@ -0,0 +1,48 @@ +paths: + output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" + target_templates: + linked_files: "linked_files/{0[individual]}" + cellranger_count: "cellranger_count/{0[individual]}/outs" + samples_sheet: "metadata/samples.csv" + construct_sce_objects: "sce_objects/{0[individual]}" +references: + all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" + #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" + #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" + #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" +metadata: + raw: + - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv" + identifier: + #- Species_ID # not needed atm since I already have only one species + #- Age_ID + #- Fraction_ID + #- Sample_NR + - Object_ID + # Define all columns from the metadata spreadsheet that + # will be included in the SingleCellExperiment / Seurat + # objects + single_cell_object_metadata_fields: + - Object_ID + - Mouse_ID + - Species_ID + - Age + - Age_ID + - Age_weeks + - Fraction + - Fraction_ID + - Antibody_combination + - Sample_NR + - Extrarun + - Batch_exp_day + - Batch_sequencing + - Date_collected + - Discard + - PID +# Enable / Disable rules and specifiy rule-specific parameters +rules: + cellranger_count: + # extra: "" # set additional arguments for cellranger count + allele_specific: False + wasp_filter_reads: False + realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index b035d0c..9f50f37 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,27 +6,27 @@ from scripts.samples import Samples samples = Samples(config) -REFERENCES = config["references"] +METADATA_PATH = config["metadata"]["raw"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] -def get_sample_string(identifiers): - return "_".join(["{" + label + "}" for label in identifiers]) +REFERENCES = config["references"] +IDENTIFIER = config["metadata"]["identifier"] -SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) +#def get_sample_string(identifiers): +# return "_".join(["{" + label + "}" for label in identifiers]) +#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS) print(samples.metadata) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules -wildcard_constraints: - sample_type="[A-z0-9_-]+", - individual="[A-z0-9_-]+" +wildcard_constraints: #should not contain ".", see rule cellranger_count + individual="[A-z0-9-]+" rule all: input: samples.targets - rule link_files: """Link files with filename pattern required for Cellranger @@ -41,9 +41,8 @@ rule link_files: ) output: - directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/{{sample_type}}/") log: - "logs/link_files_{individual}_{sample_type}.log" + f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -59,7 +58,7 @@ rule write_samples_spreadsheet: output: f"{OUTPUT_BASE_PATH}/{config['paths']['target_files']['samples_sheet']}" log: - "logs/write_samples_spreadsheet.log" + f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log" run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -75,23 +74,43 @@ rule cellranger_count: input: rules.link_files.output output: - outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}_{{sample_type}}/outs" + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs") params: genome=REFERENCES["all_masked"], - output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path - extra=config["rules"]["cellranger_count"]["extra"] + output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path + #extra=config["rules"]["cellranger_count"]["extra"] log: - "logs/cellranger_count_{individual}_{sample_type}.log" + f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log" threads: 16 envmodules: "cellranger/6.1.1", shell: "cd {params.output_root_dir}; " + "pwd;" "cellranger count " - "--id {wildcards.individual}_{wildcards.sample_type} " # letters, numbers, hyphens, and underscores + "--id {wildcards.individual} " # letters, numbers, hyphens, and underscores "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " +<<<<<<< HEAD + #"{extra} " + "--sample {wildcards.individual} " +======= "{params.extra} " "--sample {wildcards.individual}_{wildcards.sample_type} " +>>>>>>> cf05a99b38eb3233b3533b0392a99aa750ce3fa0 "> {log}" + +# construct SCE objects from counts matrices found in ../alignment/output/ +rule construct_sce_objects: + input: + output_cellranger = + f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix" + output: + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" + params: + identifier = IDENTIFIER, + metadata = METADATA_PATH, + single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] + script: + "scripts/construct_sce_objects.R" From a2895324875afd31e1b48343a752d35c804bf58e Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 10 Oct 2022 11:26:51 +0200 Subject: [PATCH 14/32] minor fixes --- config/config-test-lea.yaml | 3 +++ workflow/Snakefile | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index abecb23..60401e9 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -5,6 +5,9 @@ paths: cellranger_count: "cellranger_count/{0[individual]}/outs" samples_sheet: "metadata/samples.csv" construct_sce_objects: "sce_objects/{0[individual]}" + # define specific target files + target_files: + samples_sheet: "metadata.csv" references: all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" diff --git a/workflow/Snakefile b/workflow/Snakefile index 9f50f37..0af6b5e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -41,6 +41,7 @@ rule link_files: ) output: + directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}") log: f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" run: @@ -92,13 +93,10 @@ rule cellranger_count: "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " -<<<<<<< HEAD #"{extra} " "--sample {wildcards.individual} " -======= - "{params.extra} " - "--sample {wildcards.individual}_{wildcards.sample_type} " ->>>>>>> cf05a99b38eb3233b3533b0392a99aa750ce3fa0 + #"{params.extra} " + "--sample {wildcards.individual} " "> {log}" # construct SCE objects from counts matrices found in ../alignment/output/ From 0b12e341b11c19f49c8c223c7855050f990d7b0d Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 10 Oct 2022 11:48:09 +0200 Subject: [PATCH 15/32] minor adjustments --- workflow/Snakefile | 4 +--- workflow/scripts/samples.py | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 0af6b5e..3a1b251 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -87,14 +87,12 @@ rule cellranger_count: "cellranger/6.1.1", shell: "cd {params.output_root_dir}; " - "pwd;" + "pwd; " "cellranger count " "--id {wildcards.individual} " # letters, numbers, hyphens, and underscores "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " - #"{extra} " - "--sample {wildcards.individual} " #"{params.extra} " "--sample {wildcards.individual} " "> {log}" diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 4073787..c2f3a90 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -67,7 +67,9 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: """Add column containing CellRanger compatible filename, i.e. in the format of [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz + Here, [Sample Name] consists of Object_ID. + See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input """ @@ -136,3 +138,7 @@ def targets(self) -> list: targets = list(map(lambda x: self.metadata.agg(x.format, axis=1).drop_duplicates().to_list(), target_templates.values())) return targets + + + +#grouped = metadata_subset.groupby(["PID", "Sample Type"])["FastQ Path"].apply(list) From 9a869c76123a62cb819ec495e8ac3161e95b9894 Mon Sep 17 00:00:00 2001 From: leawlb Date: Tue, 11 Oct 2022 12:03:09 +0200 Subject: [PATCH 16/32] SCe construction now functional with metadata_full --- config/config-test-lea.yaml | 3 +-- workflow/Snakefile | 2 +- workflow/scripts/construct_sce_objects.R | 4 +++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index 60401e9..a929214 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,9 +1,8 @@ paths: - output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" + output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test5" target_templates: linked_files: "linked_files/{0[individual]}" cellranger_count: "cellranger_count/{0[individual]}/outs" - samples_sheet: "metadata/samples.csv" construct_sce_objects: "sce_objects/{0[individual]}" # define specific target files target_files: diff --git a/workflow/Snakefile b/workflow/Snakefile index 3a1b251..103dc7b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -101,7 +101,7 @@ rule cellranger_count: rule construct_sce_objects: input: output_cellranger = - f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix" + f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/" output: sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" params: diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 45795b8..578265f 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -6,7 +6,9 @@ library(DropletUtils, quietly = TRUE) # https://anaconda.org/bioconda/bioconductor-dropletutils # construct SCE object from raw cellranger output -sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], +print(paste0(snakemake@input[["output_cellranger"]], "raw_feature_bc_matrix")) + +sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"), col.names = TRUE, type = "sparse" ) From cf8fb83523faaaf477fa73b806012a9af0bdbbfe Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 12 Oct 2022 00:56:25 +0200 Subject: [PATCH 17/32] concatenate multiple identifiers --- config/config-test-lea.yaml | 20 ++++++----- workflow/Snakefile | 18 ++++------ workflow/scripts/construct_sce_objects.R | 31 ++++++++++------- workflow/scripts/samples.py | 42 ++++++++++++++++++------ 4 files changed, 70 insertions(+), 41 deletions(-) diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index a929214..d9718ef 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,5 +1,5 @@ paths: - output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test5" + output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test10" target_templates: linked_files: "linked_files/{0[individual]}" cellranger_count: "cellranger_count/{0[individual]}/outs" @@ -15,16 +15,20 @@ references: metadata: raw: - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv" - identifier: - #- Species_ID # not needed atm since I already have only one species - #- Age_ID - #- Fraction_ID - #- Sample_NR - - Object_ID + identifiers: + # Define column name(s) used to define a sample (in correct order) + # entries from multiple cols are concatenated into one sample-specific name + # in the format of {Species_ID}_{Age_ID}_{Fraction_ID}_{Sample_NR} + - Species_ID # col names I manually added to metadata sheet beforehand + - Age_ID + - Fraction_ID + - Sample_NR + #- Objects_ID + #- PID # col name automatically included in OTP metadata sheets + single_cell_object_metadata_fields: # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects - single_cell_object_metadata_fields: - Object_ID - Mouse_ID - Species_ID diff --git a/workflow/Snakefile b/workflow/Snakefile index 103dc7b..b7faef1 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,18 +10,15 @@ METADATA_PATH = config["metadata"]["raw"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] REFERENCES = config["references"] -IDENTIFIER = config["metadata"]["identifier"] - -#def get_sample_string(identifiers): -# return "_".join(["{" + label + "}" for label in identifiers]) -#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS) +IDENTIFIERS = config["metadata"]["identifiers"] print(samples.metadata) +print(samples.metadata["individual"].unique()) -localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules +localrules: link_files, write_samples_spreadsheet, all, construct_sce_objects # local execution of non-demanding rules wildcard_constraints: #should not contain ".", see rule cellranger_count - individual="[A-z0-9-]+" + individual="[A-z0-9_-]+" rule all: input: @@ -100,13 +97,12 @@ rule cellranger_count: # construct SCE objects from counts matrices found in ../alignment/output/ rule construct_sce_objects: input: - output_cellranger = - f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/" + output_cellranger = rules.cellranger_count.output output: sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" params: - identifier = IDENTIFIER, - metadata = METADATA_PATH, + identifiers = IDENTIFIERS, + metadata = METADATA_PATH, # could be replaced with samples.metadata if R script is changed as well single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 578265f..7ba72c1 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -1,14 +1,16 @@ #------------------------------------------------------------------------------- library(DropletUtils, quietly = TRUE) -# r and DropletUtils modules must be installed in snakemake conda environment +library(tidyverse, quietly = TRUE) +# r and library modules must be installed in snakemake conda environment # https://anaconda.org/conda-forge/r-base # https://anaconda.org/bioconda/bioconductor-dropletutils +# https://anaconda.org/r/r-tidyverse # construct SCE object from raw cellranger output -print(paste0(snakemake@input[["output_cellranger"]], "raw_feature_bc_matrix")) - -sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"), +print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix")) +sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], + "/raw_feature_bc_matrix"), col.names = TRUE, type = "sparse" ) @@ -21,15 +23,20 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -# to know which sample is currently loaded -wildcard_curr <- snakemake@wildcards[["individual"]] -# to know which metadata column to compare it to -identifier <- snakemake@params[["identifier"]] +wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded sample +IDENTIFIERS <- snakemake@params[["identifiers"]] + +# if necessary concatenate identifiers again to obtain all possible wildcards +metadata_curr <- metadata +if(! "individual" %in% colnames(metadata_curr)){ + metadata_curr <- metadata_curr %>% + unite(col = "individual", all_of(IDENTIFIERS), sep = "_", remove = FALSE) + print('establishing "individual" column, construct_sce_objects.R') +} -# subset data that is relevant for current object only -# as specified by wildcard and by by single_cell_object_metadata_fields -metadata_curr <- metadata[ - which(metadata[which(colnames(metadata) == identifier)] == wildcard_curr),] +# subset data as specified by wildcard and single_cell_object_metadata_fields +metadata_curr <- metadata_curr[ + which(metadata_curr$individual == wildcard_curr),] cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index c2f3a90..482a981 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,6 +2,9 @@ from snakemake.io import Wildcards +# list identifiers identically to config identifiers and in same order +IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID", "Sample_NR"] # find a way to lift directly from config + class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame @@ -23,15 +26,16 @@ class Samples: "LANE_NO", "READ", "CELLRANGER_FASTQ_PATH", - "Object_ID"] + "individual"] # include individual which is later generated + columns = columns + IDENTIFIERS # map to rename columns in the format of (old):(new) # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet # it must be identical to config metadata identifier # I haven't found a way to directly lift it from config yet - columns_map = { - "Object_ID": "individual" - } + #columns_map = { + # "Object_ID": "individual" + #} def __init__(self, config): @@ -42,13 +46,31 @@ def __init__(self, config): metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True) + """Make "individual" column + + A single identifier col is effectively renamed to "individual" but + entries from multiple identifier cols are concatenated to "individual". + + Entries in the "individual" col are used as wildcards. + + For metadata_full it is used for downstream functions (?). + + This may cause a warning that I still have to take care of but that doesn't seem critical. + """ + if not "individual" in metadata_full.columns: + print('establishing "individual" column') + for i in IDENTIFIERS: + if i == IDENTIFIERS[0]: + metadata_full["individual"] = "" + metadata_full["individual"] = metadata_full["individual"] + metadata_full[i].map(str) + else: + metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str) + metadata_full["DATE_OF_BIRTH"] = metadata_full.apply(self.rename_date_of_birth, axis=1) - + metadata_full = self.get_cellranger_filename(metadata_full) self.metadata = self.select_columns(metadata_full) - self.metadata = self.metadata.rename(self.columns_map, axis="columns") - def rename_date_of_birth(self, row: pd.Series): @@ -68,18 +90,18 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: i.e. in the format of [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz - Here, [Sample Name] consists of Object_ID. + Here, [Sample Name] consists of "individual". See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input """ - grouped = df.groupby(["Object_ID", "LANE_NO", "READ"]) + grouped = df.groupby(["individual", "LANE_NO", "READ"]) groups = [] for name, group in grouped: group = group.sort_values("FastQ Path") # import to have consistent sorting group["multi_sample_idx"] = range(1, len(group)+1) group["CELLRANGER_FASTQ_PATH"] = group.agg( - "{0[Object_ID]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, + "{0[individual]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, axis=1, ) groups.append(group) From f415304c377cc877da4b58f055cb7b4c50d0c91f Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 12 Oct 2022 13:09:49 +0200 Subject: [PATCH 18/32] minor changes for functionality --- config/config-example.yaml | 9 ++++----- config/config-test-lea.yaml | 6 ++++-- workflow/Snakefile | 2 +- workflow/scripts/construct_sce_objects.R | 5 ++--- workflow/scripts/samples.py | 5 ++--- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/config/config-example.yaml b/config/config-example.yaml index 2040a43..0e12dce 100644 --- a/config/config-example.yaml +++ b/config/config-example.yaml @@ -22,16 +22,15 @@ metadata: - "" # Define the column names used to define a sample identifiers: - - SAMPLE_NAME + - PID - Sample_Type - - Age - - fraction # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects single_cell_object_metadata_fields: - - SAMPLE_NAME - - Age + - PID + - Sample_Type + - individual # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index d9718ef..327b7db 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,5 +1,5 @@ paths: - output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test10" + output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" target_templates: linked_files: "linked_files/{0[individual]}" cellranger_count: "cellranger_count/{0[individual]}/outs" @@ -25,11 +25,12 @@ metadata: - Sample_NR #- Objects_ID #- PID # col name automatically included in OTP metadata sheets + #- Sample Type # col name automatically included in OTP metadata sheets single_cell_object_metadata_fields: # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects - - Object_ID + - Object_ID # colnames I manually added to metadata - Mouse_ID - Species_ID - Age @@ -44,6 +45,7 @@ metadata: - Batch_sequencing - Date_collected - Discard + - individual - PID # Enable / Disable rules and specifiy rule-specific parameters rules: diff --git a/workflow/Snakefile b/workflow/Snakefile index b7faef1..d3d99cb 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -102,7 +102,7 @@ rule construct_sce_objects: sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" params: identifiers = IDENTIFIERS, - metadata = METADATA_PATH, # could be replaced with samples.metadata if R script is changed as well + metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 7ba72c1..e2625aa 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -23,7 +23,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded sample +wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample IDENTIFIERS <- snakemake@params[["identifiers"]] # if necessary concatenate identifiers again to obtain all possible wildcards @@ -35,8 +35,7 @@ if(! "individual" %in% colnames(metadata_curr)){ } # subset data as specified by wildcard and single_cell_object_metadata_fields -metadata_curr <- metadata_curr[ - which(metadata_curr$individual == wildcard_curr),] +metadata_curr <- metadata_curr[which(metadata_curr$individual == wildcard_curr),] cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 482a981..511668e 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -48,7 +48,7 @@ def __init__(self, config): """Make "individual" column - A single identifier col is effectively renamed to "individual" but + A single identifier col is copied and renamed to "individual" but entries from multiple identifier cols are concatenated to "individual". Entries in the "individual" col are used as wildcards. @@ -61,8 +61,7 @@ def __init__(self, config): print('establishing "individual" column') for i in IDENTIFIERS: if i == IDENTIFIERS[0]: - metadata_full["individual"] = "" - metadata_full["individual"] = metadata_full["individual"] + metadata_full[i].map(str) + metadata_full["individual"] = metadata_full[i].map(str) else: metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str) From 1489930b1306cdbc82ff9f4da4fee314fd5bc673 Mon Sep 17 00:00:00 2001 From: leawlb Date: Fri, 21 Oct 2022 17:48:45 +0200 Subject: [PATCH 19/32] changed subfolderstructure, added second wildcard again --- config/config-test-lea.yaml | 12 ++++++------ workflow/Snakefile | 21 ++++++++++++--------- workflow/scripts/samples.py | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index 327b7db..9c66a52 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,9 +1,9 @@ paths: - output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test" target_templates: - linked_files: "linked_files/{0[individual]}" - cellranger_count: "cellranger_count/{0[individual]}/outs" - construct_sce_objects: "sce_objects/{0[individual]}" + linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" + cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" + construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" # define specific target files target_files: samples_sheet: "metadata.csv" @@ -14,7 +14,7 @@ references: #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" metadata: raw: - - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv" + - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv" identifiers: # Define column name(s) used to define a sample (in correct order) # entries from multiple cols are concatenated into one sample-specific name @@ -44,7 +44,7 @@ metadata: - Batch_exp_day - Batch_sequencing - Date_collected - - Discard + - Keep_sample - individual - PID # Enable / Disable rules and specifiy rule-specific parameters diff --git a/workflow/Snakefile b/workflow/Snakefile index d3d99cb..ae4d191 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,10 +15,11 @@ IDENTIFIERS = config["metadata"]["identifiers"] print(samples.metadata) print(samples.metadata["individual"].unique()) -localrules: link_files, write_samples_spreadsheet, all, construct_sce_objects # local execution of non-demanding rules +localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules wildcard_constraints: #should not contain ".", see rule cellranger_count - individual="[A-z0-9_-]+" + individual="[A-z0-9_-]+", + Species_ID="[A-z0-9_-]+" rule all: input: @@ -38,9 +39,9 @@ rule link_files: ) output: - directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}") + directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}") log: - f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -54,9 +55,9 @@ rule link_files: rule write_samples_spreadsheet: output: - f"{OUTPUT_BASE_PATH}/{config['paths']['target_files']['samples_sheet']}" + f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}" log: - f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log" run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -72,13 +73,13 @@ rule cellranger_count: input: rules.link_files.output output: - outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs") + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger/cellranger_count/{{Species_ID}}/{{individual}}/outs") params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path #extra=config["rules"]["cellranger_count"]["extra"] log: - f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/cellranger_count_{{individual}}.log" threads: 16 envmodules: "cellranger/6.1.1", @@ -99,10 +100,12 @@ rule construct_sce_objects: input: output_cellranger = rules.cellranger_count.output output: - sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}" + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" params: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] script: "scripts/construct_sce_objects.R" + + diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 511668e..1f0c2c5 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -129,7 +129,7 @@ def filter_by_wildcards( :return param: """ - _identifiers = ["individual"] + _identifiers = ["individual", "Species_ID"] if wildcards: filters = dict((k, getattr(wildcards, k)) for k in _identifiers) From fd312f0cdfc0602902d7f0f669b9471ffdd8a94b Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 9 Nov 2022 20:16:33 +0100 Subject: [PATCH 20/32] add automatic resolution of required conda env for SCE construction --- config/config-test-lea.yaml | 4 ++-- workflow/Snakefile | 5 +++-- workflow/envs/construct_sce_objects.yaml | 12 ++++++++++++ workflow/scripts/construct_sce_objects.R | 12 +++++------- 4 files changed, 22 insertions(+), 11 deletions(-) create mode 100644 workflow/envs/construct_sce_objects.yaml diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index 9c66a52..d910fe9 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,9 +1,9 @@ paths: - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" target_templates: linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" - construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" + construct_sce_objects: "sce_objects/archive/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" # define specific target files target_files: samples_sheet: "metadata.csv" diff --git a/workflow/Snakefile b/workflow/Snakefile index ae4d191..4b1a849 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -100,12 +100,13 @@ rule construct_sce_objects: input: output_cellranger = rules.cellranger_count.output output: - sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/archive/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" params: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] + conda: + "envs/construct_sce_objects.yaml" script: "scripts/construct_sce_objects.R" - diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml new file mode 100644 index 0000000..048c041 --- /dev/null +++ b/workflow/envs/construct_sce_objects.yaml @@ -0,0 +1,12 @@ +name: construct_sce_objects +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-base=4.1.3 + - bioconductor-dropletutils=1.14.2 + - r-tidyverse=1.3.2 + +# set channel priority to strict with conda config --set channel_priority strict +# use --conda-frontend conda in snakemake command diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index e2625aa..36d379e 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -1,11 +1,8 @@ #------------------------------------------------------------------------------- -library(DropletUtils, quietly = TRUE) library(tidyverse, quietly = TRUE) -# r and library modules must be installed in snakemake conda environment -# https://anaconda.org/conda-forge/r-base -# https://anaconda.org/bioconda/bioconductor-dropletutils -# https://anaconda.org/r/r-tidyverse +#library(SummarizedExperiment, quietly = TRUE) +library(DropletUtils, quietly = TRUE) # construct SCE object from raw cellranger output print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix")) @@ -23,7 +20,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample +individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample IDENTIFIERS <- snakemake@params[["identifiers"]] # if necessary concatenate identifiers again to obtain all possible wildcards @@ -35,7 +32,7 @@ if(! "individual" %in% colnames(metadata_curr)){ } # subset data as specified by wildcard and single_cell_object_metadata_fields -metadata_curr <- metadata_curr[which(metadata_curr$individual == wildcard_curr),] +metadata_curr <- metadata_curr[which(metadata_curr$individual == individual_curr),] cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] @@ -44,4 +41,5 @@ for(i in colnames(metadata_curr)){ colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce)) } +print(sce) saveRDS(sce, file = snakemake@output[["sce_objects"]]) \ No newline at end of file From 3bdeeeca4ea163936a7161d4e1111e1fa53d6398 Mon Sep 17 00:00:00 2001 From: leawlb Date: Wed, 9 Nov 2022 22:28:34 +0100 Subject: [PATCH 21/32] removed manual addition of identifiers to samples.py --- workflow/Snakefile | 14 +++++++---- workflow/envs/construct_sce_objects.yaml | 4 +--- workflow/scripts/construct_sce_objects.R | 1 - workflow/scripts/samples.py | 30 +++++++++++------------- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 4b1a849..de3aaa3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -13,7 +13,6 @@ REFERENCES = config["references"] IDENTIFIERS = config["metadata"]["identifiers"] print(samples.metadata) -print(samples.metadata["individual"].unique()) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules @@ -96,17 +95,24 @@ rule cellranger_count: "> {log}" # construct SCE objects from counts matrices found in ../alignment/output/ + +""" +# I set channel priority to strict with << conda config --set channel_priority strict >> +# I had to use << --conda-frontend conda >> in snakemake command +# if used more than a couple times I suggest directly installing the packages into the snakemake env +""" + rule construct_sce_objects: input: output_cellranger = rules.cellranger_count.output output: - sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/archive/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" params: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] - conda: - "envs/construct_sce_objects.yaml" + #conda: + # "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated script: "scripts/construct_sce_objects.R" diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml index 048c041..2924286 100644 --- a/workflow/envs/construct_sce_objects.yaml +++ b/workflow/envs/construct_sce_objects.yaml @@ -7,6 +7,4 @@ dependencies: - r-base=4.1.3 - bioconductor-dropletutils=1.14.2 - r-tidyverse=1.3.2 - -# set channel priority to strict with conda config --set channel_priority strict -# use --conda-frontend conda in snakemake command + \ No newline at end of file diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 36d379e..8f1bfd8 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -1,7 +1,6 @@ #------------------------------------------------------------------------------- library(tidyverse, quietly = TRUE) -#library(SummarizedExperiment, quietly = TRUE) library(DropletUtils, quietly = TRUE) # construct SCE object from raw cellranger output diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 1f0c2c5..4915cc4 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -3,7 +3,7 @@ from snakemake.io import Wildcards # list identifiers identically to config identifiers and in same order -IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID", "Sample_NR"] # find a way to lift directly from config +#IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID"] # find a way to lift directly from config class Samples: """ @@ -27,18 +27,14 @@ class Samples: "READ", "CELLRANGER_FASTQ_PATH", "individual"] # include individual which is later generated - - columns = columns + IDENTIFIERS - # map to rename columns in the format of (old):(new) - # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet - # it must be identical to config metadata identifier - # I haven't found a way to directly lift it from config yet - #columns_map = { - # "Object_ID": "individual" - #} - - + #print(IDENTIFIERS) + #print(columns) + #columns = columns + IDENTIFIERS + #print(columns) + #columns = None + def __init__(self, config): + IDENTIFIERS = config["metadata"]["identifiers"] metadata_files = config["metadata"]["raw"] self.output_base_dir = config["paths"]["output_dir"] @@ -69,7 +65,7 @@ def __init__(self, config): metadata_full = self.get_cellranger_filename(metadata_full) - self.metadata = self.select_columns(metadata_full) + self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS) def rename_date_of_birth(self, row: pd.Series): @@ -109,13 +105,15 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: def select_columns(self, df: pd.DataFrame, - columns: list = None): + columns: list = None, + identifiers: str = None): """Select/Subset columns from DataFrame to reduce DataFrame dimensions. """ + if not columns: - columns = self.columns + columns = self.columns + identifiers return df[columns] - + @staticmethod def filter_by_wildcards( wildcards: Wildcards, From 32a5d5569ab2774115b668c810b599a5fc4fa724 Mon Sep 17 00:00:00 2001 From: leawlb Date: Sun, 27 Nov 2022 23:16:40 +0100 Subject: [PATCH 22/32] various corrections --- config/config-test-lea.yaml | 6 +++--- workflow/Snakefile | 7 +++++-- workflow/scripts/samples.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml index d910fe9..4be12d7 100644 --- a/config/config-test-lea.yaml +++ b/config/config-test-lea.yaml @@ -1,14 +1,14 @@ paths: - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test4" target_templates: linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" - construct_sce_objects: "sce_objects/archive/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" + construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" # define specific target files target_files: samples_sheet: "metadata.csv" references: - all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains" + all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/" #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" diff --git a/workflow/Snakefile b/workflow/Snakefile index de3aaa3..f4c6cf3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -41,6 +41,7 @@ rule link_files: directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}") log: f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log" + threads: 1 run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -57,6 +58,7 @@ rule write_samples_spreadsheet: f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}" log: f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log" + threads: 1 run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -111,8 +113,9 @@ rule construct_sce_objects: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] - #conda: - # "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated + threads: 1 + conda: + "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 4915cc4..c20cdf5 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -127,7 +127,7 @@ def filter_by_wildcards( :return param: """ - _identifiers = ["individual", "Species_ID"] + _identifiers = ["individual", "Species_ID"] # possible wildcards if wildcards: filters = dict((k, getattr(wildcards, k)) for k in _identifiers) From 7931239045010c1eebc214eabef6db88fe363fc1 Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 28 Nov 2022 09:38:01 +0100 Subject: [PATCH 23/32] altered individual column generation and renamed config --- ...ml => config-interspecies-bonemarrow.yaml} | 7 ++-- workflow/Snakefile | 13 +++---- workflow/scripts/construct_sce_objects.R | 2 +- workflow/scripts/samples.py | 35 +++---------------- 4 files changed, 14 insertions(+), 43 deletions(-) rename config/{config-test-lea.yaml => config-interspecies-bonemarrow.yaml} (92%) diff --git a/config/config-test-lea.yaml b/config/config-interspecies-bonemarrow.yaml similarity index 92% rename from config/config-test-lea.yaml rename to config/config-interspecies-bonemarrow.yaml index 4be12d7..b22c35d 100644 --- a/config/config-test-lea.yaml +++ b/config/config-interspecies-bonemarrow.yaml @@ -1,7 +1,7 @@ paths: - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test4" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" target_templates: - linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" + linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" # define specific target files @@ -23,7 +23,6 @@ metadata: - Age_ID - Fraction_ID - Sample_NR - #- Objects_ID #- PID # col name automatically included in OTP metadata sheets #- Sample Type # col name automatically included in OTP metadata sheets single_cell_object_metadata_fields: @@ -50,7 +49,7 @@ metadata: # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: - # extra: "" # set additional arguments for cellranger count + extra: "" # set additional arguments for cellranger count allele_specific: False wasp_filter_reads: False realign_bam: False \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index f4c6cf3..eab0548 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -41,7 +41,6 @@ rule link_files: directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}") log: f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log" - threads: 1 run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -58,7 +57,6 @@ rule write_samples_spreadsheet: f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}" log: f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log" - threads: 1 run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -77,8 +75,8 @@ rule cellranger_count: outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger/cellranger_count/{{Species_ID}}/{{individual}}/outs") params: genome=REFERENCES["all_masked"], - output_root_dir=lambda wildcards, output: Path(output[0]).parents[1] # remove last 2 levels from output path - #extra=config["rules"]["cellranger_count"]["extra"] + output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path + extra=config["rules"]["cellranger_count"]["extra"] log: f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/cellranger_count_{{individual}}.log" threads: 16 @@ -92,7 +90,7 @@ rule cellranger_count: "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " - #"{params.extra} " + "{params.extra} " "--sample {wildcards.individual} " "> {log}" @@ -113,9 +111,8 @@ rule construct_sce_objects: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] - threads: 1 - conda: - "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated + conda: # if all required packages are installed in the snakemake environment this can be omitted + "envs/construct_sce_objects.yaml" # if all required packages are installed in the snakemake environment this can be omitted script: "scripts/construct_sce_objects.R" diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 8f1bfd8..2adaf20 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -22,7 +22,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample IDENTIFIERS <- snakemake@params[["identifiers"]] -# if necessary concatenate identifiers again to obtain all possible wildcards +# if necessary, concatenate identifiers again to obtain all possible wildcards metadata_curr <- metadata if(! "individual" %in% colnames(metadata_curr)){ metadata_curr <- metadata_curr %>% diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index c20cdf5..66214b7 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,9 +2,6 @@ from snakemake.io import Wildcards -# list identifiers identically to config identifiers and in same order -#IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID"] # find a way to lift directly from config - class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame @@ -26,12 +23,7 @@ class Samples: "LANE_NO", "READ", "CELLRANGER_FASTQ_PATH", - "individual"] # include individual which is later generated - #print(IDENTIFIERS) - #print(columns) - #columns = columns + IDENTIFIERS - #print(columns) - #columns = None + "individual"] # include 'individual' col to be generated below def __init__(self, config): IDENTIFIERS = config["metadata"]["identifiers"] @@ -42,27 +34,10 @@ def __init__(self, config): metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True) - """Make "individual" column - - A single identifier col is copied and renamed to "individual" but - entries from multiple identifier cols are concatenated to "individual". - - Entries in the "individual" col are used as wildcards. - - For metadata_full it is used for downstream functions (?). - - This may cause a warning that I still have to take care of but that doesn't seem critical. - """ + # generate 'individual' column containing all concatenated identifiers if not "individual" in metadata_full.columns: - print('establishing "individual" column') - for i in IDENTIFIERS: - if i == IDENTIFIERS[0]: - metadata_full["individual"] = metadata_full[i].map(str) - else: - metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str) - - metadata_full["DATE_OF_BIRTH"] = metadata_full.apply(self.rename_date_of_birth, axis=1) - + metadata_full['individual'] = metadata_full[IDENTIFIERS].apply('_'.join, axis=1) + metadata_full = self.get_cellranger_filename(metadata_full) self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS) @@ -127,7 +102,7 @@ def filter_by_wildcards( :return param: """ - _identifiers = ["individual", "Species_ID"] # possible wildcards + _identifiers = ["individual", "Species_ID"] if wildcards: filters = dict((k, getattr(wildcards, k)) for k in _identifiers) From 91401fadf0fc12488ae32371e439ac22a2c9f1ae Mon Sep 17 00:00:00 2001 From: leawlb <76486615+leawlb@users.noreply.github.com> Date: Mon, 28 Nov 2022 10:11:37 +0100 Subject: [PATCH 24/32] Update README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index ff6e88e..0faa717 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,12 @@ snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --c - `--cluster` may change depending on the computational footprint of your analyses - `--configfile` should point to your personal configuration +For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. +For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add: + +```bash +--conda-frontend conda +``` + +to the snakemake command. +For multiple runs, it is recommended to install these packages directly into the snakemake environment as well. From 28fc10680b8caf868ab798e1286dd9c00da28ff0 Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 26 Jan 2023 16:17:27 +0100 Subject: [PATCH 25/32] added complete env files and smalle improvements --- config/config-interspecies-bonemarrow.yaml | 27 ++- workflow/Snakefile | 20 +- workflow/envs/construct_sce_objects.yaml | 247 ++++++++++++++++++++- workflow/envs/snakemake_cellranger.yaml | 181 +++++++++++++++ workflow/scripts/construct_sce_objects.R | 1 + 5 files changed, 448 insertions(+), 28 deletions(-) create mode 100644 workflow/envs/snakemake_cellranger.yaml diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml index b22c35d..872907b 100644 --- a/config/config-interspecies-bonemarrow.yaml +++ b/config/config-interspecies-bonemarrow.yaml @@ -1,6 +1,12 @@ paths: - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" + # directory to store output data = cellranger files and SingleCellExperiment objects + # paths to input data = fastq files are stored in metadata sheet (OTP export) + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test" target_templates: + # paths for each type of output file + # remove "/{0[Species_ID]}" to store all objects in one directory or + # replace "Species_ID" with other colname as suitable wildcard from metadata sheet (e.g. Sample_Type, ...) to create subfolders + # remove or replace all instances of Species_ID in Snakefile and samples.py linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" @@ -9,27 +15,24 @@ paths: samples_sheet: "metadata.csv" references: all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/" - #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/" - #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" - #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/" metadata: raw: + # should have same format as or be based on OTP export sheet - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv" identifiers: - # Define column name(s) used to define a sample (in correct order) - # entries from multiple cols are concatenated into one sample-specific name + # define one or multiple column name(s) that uniquely define a sample/object + # multiple cols are concatenated into one sample-specific name # in the format of {Species_ID}_{Age_ID}_{Fraction_ID}_{Sample_NR} - - Species_ID # col names I manually added to metadata sheet beforehand + - Species_ID # columns I manually added to metadata sheet - Age_ID - Fraction_ID - Sample_NR - #- PID # col name automatically included in OTP metadata sheets - #- Sample Type # col name automatically included in OTP metadata sheets + #- PID # columns included in OTP metadata sheet + #- Sample_Type single_cell_object_metadata_fields: # Define all columns from the metadata spreadsheet that - # will be included in the SingleCellExperiment / Seurat - # objects - - Object_ID # colnames I manually added to metadata + # will be included in the SingleCellExperiment / Seurat objects + - Object_ID # columns I manually added to metadata sheet - Mouse_ID - Species_ID - Age diff --git a/workflow/Snakefile b/workflow/Snakefile index eab0548..9bc9bca 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -94,15 +94,17 @@ rule cellranger_count: "--sample {wildcards.individual} " "> {log}" -# construct SCE objects from counts matrices found in ../alignment/output/ - -""" -# I set channel priority to strict with << conda config --set channel_priority strict >> -# I had to use << --conda-frontend conda >> in snakemake command -# if used more than a couple times I suggest directly installing the packages into the snakemake env -""" rule construct_sce_objects: + """construct SCE objects from counts matrices + + set channel priority to strict with << conda config --set channel_priority strict >> + use << --conda-frontend conda >> in snakemake command + + Initially, building the environment might take some time. + + Note: output must match target_templates in configuration YAML. + """ input: output_cellranger = rules.cellranger_count.output output: @@ -111,8 +113,8 @@ rule construct_sce_objects: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] - conda: # if all required packages are installed in the snakemake environment this can be omitted - "envs/construct_sce_objects.yaml" # if all required packages are installed in the snakemake environment this can be omitted + conda: + "envs/construct_sce_objects.yaml" script: "scripts/construct_sce_objects.R" diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml index 2924286..bd49ff1 100644 --- a/workflow/envs/construct_sce_objects.yaml +++ b/workflow/envs/construct_sce_objects.yaml @@ -1,10 +1,243 @@ -name: construct_sce_objects +# this environment is automatically created by snakemake +name: construct-sce-objects channels: - - conda-forge - - bioconda - - defaults +- conda-forge +- bioconda +- defaults dependencies: - - r-base=4.1.3 - - bioconductor-dropletutils=1.14.2 - - r-tidyverse=1.3.2 +- r-dbplyr=2.3.0=r41hc72bb7e_0 +- bioconductor-delayedmatrixstats=1.16.0=r41hdfd78af_0 +- font-ttf-ubuntu=0.83=hab24e00_0 +- libsanitizer=12.2.0=h46fd767_19 +- bwidget=1.9.14=ha770c72_1 +- r-bit=4.0.5=r41h06615bd_0 +- libcurl=7.86.0=h7bff187_1 +- libpng=1.6.39=h753d276_0 +- r-digest=0.6.31=r41h38f115c_0 +- r-fastmap=1.1.0=r41h7525677_1 +- r-cpp11=0.4.3=r41hc72bb7e_0 +- _r-mutex=1.0.1=anacondar_1 +- r-stringi=1.7.12=r41h1ae9187_0 +- r-forcats=0.5.2=r41hc72bb7e_1 +- r-withr=2.5.0=r41hc72bb7e_1 +- r-crayon=1.5.2=r41hc72bb7e_1 +- r-highr=0.10=r41hc72bb7e_0 +- r-sys=3.4.1=r41h06615bd_0 +- r-formatr=1.14=r41hc72bb7e_0 +- r-base=4.1.3=h2f963a2_5 +- bioconductor-delayedarray=0.20.0=r41hc0cfd56_2 +- libstdcxx-ng=12.2.0=h46fd767_19 +- tktable=2.10=hb7b940f_3 +- r-sass=0.4.5=r41h38f115c_0 +- lerc=4.0.0=h27087fc_0 +- libgcc-devel_linux-64=12.2.0=h3b97bd3_19 +- bioconductor-scuttle=1.4.0=r41hc247a5b_2 +- libgfortran5=12.2.0=h337968e_19 +- r-colorspace=2.1_0=r41h133d619_0 +- r-ids=1.0.1=r41hc72bb7e_2 +- libstdcxx-devel_linux-64=12.2.0=h3b97bd3_19 +- jpeg=9e=h166bdaf_2 +- r-callr=3.7.3=r41hc72bb7e_0 +- r-curl=4.3.3=r41h06615bd_1 +- fribidi=1.0.10=h36c2ea0_0 +- krb5=1.19.3=h3790be6_0 +- bioconductor-xvector=0.34.0=r41hc0cfd56_2 +- bioconductor-biocparallel=1.28.3=r41hc247a5b_1 +- libgcc-ng=12.2.0=h65d4601_19 +- r-r.utils=2.12.2=r41hc72bb7e_0 +- r-lubridate=1.9.1=r41h133d619_0 +- r-base64enc=0.1_3=r41h06615bd_1005 +- r-readr=2.1.3=r41h7525677_1 +- ca-certificates=2022.12.7=ha878542_0 +- xorg-kbproto=1.0.7=h7f98852_1002 +- r-xml2=1.3.3=r41h044e5c7_2 +- r-scales=1.2.1=r41hc72bb7e_1 +- sed=4.8=he412f7d_0 +- harfbuzz=6.0.0=h8e241bc_0 +- readline=8.1.2=h0f457ee_0 +- zstd=1.5.2=h3eb15da_6 +- r-rcurl=1.98_1.9=r41h06615bd_1 +- libuuid=2.32.1=h7f98852_1000 +- zlib=1.2.13=h166bdaf_4 +- r-r6=2.5.1=r41hc72bb7e_1 +- bioconductor-edger=3.36.0=r41hc247a5b_2 +- libzlib=1.2.13=h166bdaf_4 +- r-rcolorbrewer=1.1_3=r41h785f33e_1 +- libedit=3.1.20191231=he28a2e2_2 +- r-farver=2.1.1=r41h7525677_1 +- r-lattice=0.20_45=r41h06615bd_1 +- libgfortran-ng=12.2.0=h69a702a_19 +- fonts-conda-ecosystem=1=0 +- r-glue=1.6.2=r41h06615bd_1 +- xorg-libice=1.0.10=h7f98852_0 +- r-utf8=1.2.2=r41h06615bd_1 +- xorg-libxt=1.2.1=h7f98852_2 +- pandoc=2.19.2=h32600fe_1 +- bioconductor-genomicranges=1.46.1=r41hc0cfd56_1 +- r-pkgconfig=2.0.3=r41hc72bb7e_2 +- r-rstudioapi=0.14=r41hc72bb7e_1 +- r-memoise=2.0.1=r41hc72bb7e_1 +- r-futile.options=1.0.1=r41hc72bb7e_1003 +- r-snow=0.4_4=r41hc72bb7e_1 +- r-httr=1.4.4=r41hc72bb7e_1 +- libffi=3.4.2=h7f98852_5 +- _openmp_mutex=4.5=2_gnu +- sysroot_linux-64=2.12=he073ed8_15 +- keyutils=1.6.1=h166bdaf_0 +- bioconductor-biobase=2.54.0=r41hc0cfd56_2 +- r-googledrive=2.0.0=r41hc72bb7e_1 +- r-rvest=1.0.3=r41hc72bb7e_1 +- _libgcc_mutex=0.1=conda_forge +- bioconductor-singlecellexperiment=1.16.0=r41hdfd78af_0 +- r-yaml=2.3.7=r41h133d619_0 +- r-selectr=0.4_2=r41hc72bb7e_2 +- r-generics=0.1.3=r41hc72bb7e_1 +- r-backports=1.4.1=r41h06615bd_1 +- r-ps=1.7.2=r41h06615bd_0 +- font-ttf-inconsolata=3.000=h77eed37_0 +- fonts-conda-forge=1=0 +- bioconductor-genomeinfodbdata=1.2.7=r41hdfd78af_2 +- r-gargle=1.2.1=r41hc72bb7e_1 +- r-viridislite=0.4.1=r41hc72bb7e_1 +- r-nlme=3.1_161=r41hac0b197_0 +- r-hms=1.1.2=r41hc72bb7e_1 +- r-munsell=0.5.0=r41hc72bb7e_1005 +- ld_impl_linux-64=2.39=hcc3a1bd_1 +- curl=7.86.0=h7bff187_1 +- c-ares=1.18.1=h7f98852_0 +- r-sitmo=2.0.2=r41h7525677_1 +- r-locfit=1.5_9.7=r41h133d619_0 +- bioconductor-matrixgenerics=1.6.0=r41hdfd78af_0 +- r-tzdb=0.3.0=r41h7525677_1 +- pango=1.50.12=hd33c08f_1 +- r-jsonlite=1.8.4=r41h133d619_0 +- libev=4.33=h516909a_1 +- libwebp-base=1.2.4=h166bdaf_0 +- bioconductor-dropletutils=1.14.2=r41hc247a5b_1 +- r-dplyr=1.0.10=r41h7525677_1 +- bioconductor-beachmat=2.10.0=r41hc247a5b_2 +- r-vroom=1.6.1=r41h38f115c_0 +- r-data.table=1.14.6=r41h06615bd_0 +- pixman=0.40.0=h36c2ea0_0 +- r-tibble=3.1.8=r41h06615bd_1 +- r-xfun=0.36=r41h38f115c_0 +- bioconductor-sparsematrixstats=1.6.0=r41hc247a5b_2 +- r-evaluate=0.20=r41hc72bb7e_0 +- xorg-xproto=7.0.31=h7f98852_1007 +- r-jquerylib=0.1.4=r41hc72bb7e_1 +- xorg-libxrender=0.9.10=h7f98852_1003 +- r-mass=7.3_58.2=r41h133d619_0 +- r-r.oo=1.25.0=r41hc72bb7e_1 +- r-rematch2=2.1.2=r41hc72bb7e_2 +- r-mime=0.12=r41h06615bd_1 +- liblapack=3.9.0=16_linux64_openblas +- r-processx=3.8.0=r41h06615bd_0 +- r-blob=1.2.3=r41hc72bb7e_1 +- libblas=3.9.0=16_linux64_openblas +- bioconductor-zlibbioc=1.40.0=r41hc0cfd56_2 +- r-haven=2.5.1=r41h7525677_0 +- r-timechange=0.2.0=r41h38f115c_0 +- r-matrixstats=0.63.0=r41h06615bd_0 +- r-assertthat=0.2.1=r41hc72bb7e_3 +- xorg-xextproto=7.3.0=h7f98852_1002 +- bioconductor-s4vectors=0.32.4=r41hc0cfd56_0 +- r-rappdirs=0.3.3=r41h06615bd_1 +- r-ggplot2=3.4.0=r41hc72bb7e_1 +- xz=5.2.6=h166bdaf_0 +- libgomp=12.2.0=h65d4601_19 +- r-cachem=1.0.6=r41h06615bd_1 +- r-tidyverse=1.3.2=r41hc72bb7e_1 +- r-clipr=0.8.0=r41hc72bb7e_1 +- r-rcpp=1.0.10=r41h38f115c_0 +- font-ttf-source-code-pro=2.038=h77eed37_0 +- font-ttf-dejavu-sans-mono=2.37=hab24e00_0 +- bioconductor-iranges=2.28.0=r41hc0cfd56_2 +- libnghttp2=1.51.0=hdcd2b5c_0 +- bioconductor-rhdf5lib=1.16.0=r41hc0cfd56_2 +- r-fansi=1.0.4=r41h133d619_0 +- kernel-headers_linux-64=2.6.32=he073ed8_15 +- gcc_impl_linux-64=12.2.0=hcc96c02_19 +- r-pillar=1.8.1=r41hc72bb7e_1 +- gettext=0.21.1=h27087fc_0 +- libiconv=1.17=h166bdaf_0 +- r-bitops=1.0_7=r41h06615bd_1 +- bioconductor-hdf5array=1.22.1=r41hc0cfd56_1 +- r-purrr=1.0.1=r41h133d619_0 +- r-broom=1.0.3=r41hc72bb7e_0 +- r-bh=1.81.0_1=r41hc72bb7e_0 +- libxml2=2.10.3=h7463322_0 +- r-isoband=0.2.7=r41h38f115c_1 +- r-futile.logger=1.4.3=r41hc72bb7e_1004 +- r-reprex=2.0.2=r41hc72bb7e_1 +- r-openssl=2.0.5=r41hb1dc35e_0 +- r-lifecycle=1.0.3=r41hc72bb7e_1 +- r-stringr=1.5.0=r41h785f33e_0 +- r-tidyselect=1.2.0=r41hc72bb7e_0 +- r-lambda.r=1.2.4=r41hc72bb7e_2 +- expat=2.5.0=h27087fc_0 +- r-r.methodss3=1.8.2=r41hc72bb7e_1 +- r-dbi=1.1.3=r41hc72bb7e_1 +- r-rmarkdown=2.20=r41hc72bb7e_0 +- xorg-libx11=1.7.2=h7f98852_0 +- bzip2=1.0.8=h7f98852_4 +- r-cli=3.6.0=r41h38f115c_0 +- r-gtable=0.3.1=r41hc72bb7e_1 +- libtiff=4.5.0=h6adf6a1_2 +- libopenblas=0.3.21=pthreads_h78a6416_3 +- tk=8.6.12=h27826a3_0 +- bioconductor-genomeinfodb=1.30.1=r41hdfd78af_0 +- bioconductor-limma=3.50.3=r41hc0cfd56_0 +- xorg-libxau=1.0.9=h7f98852_0 +- r-cellranger=1.1.0=r41hc72bb7e_1005 +- make=4.3=hd18ef5c_1 +- r-rematch=1.0.1=r41hc72bb7e_1005 +- xorg-renderproto=0.11.1=h7f98852_1002 +- graphite2=1.3.13=h58526e2_1001 +- r-htmltools=0.5.4=r41h38f115c_0 +- r-mgcv=1.8_41=r41h5f7b363_0 +- r-knitr=1.42=r41hc72bb7e_1 +- libdeflate=1.17=h0b41bf4_0 +- bioconductor-rhdf5filters=1.6.0=r41hc247a5b_2 +- fontconfig=2.14.1=hc2a2eb6_0 +- r-bslib=0.4.2=r41hc72bb7e_0 +- gfortran_impl_linux-64=12.2.0=h55be85b_19 +- r-dqrng=0.3.0=r41h7525677_1 +- libxcb=1.13=h7f98852_1004 +- xorg-libsm=1.2.3=hd9c2040_1000 +- r-googlesheets4=1.0.1=r41h785f33e_1 +- r-prettyunits=1.1.1=r41hc72bb7e_2 +- libcblas=3.9.0=16_linux64_openblas +- bioconductor-rhdf5=2.38.1=r41hbe1951d_0 +- binutils_impl_linux-64=2.39=he00db2b_1 +- icu=70.1=h27087fc_0 +- pcre2=10.40=hc3806b6_0 +- r-rlang=1.0.6=r41h7525677_1 +- r-readxl=1.4.1=r41h3ebcfa7_1 +- pthread-stubs=0.4=h36c2ea0_1001 +- r-ellipsis=0.3.2=r41h06615bd_1 +- r-magrittr=2.0.3=r41h06615bd_1 +- openssl=1.1.1s=h0b41bf4_1 +- gxx_impl_linux-64=12.2.0=hcc96c02_19 +- r-progress=1.2.2=r41hc72bb7e_3 +- r-fs=1.6.0=r41h38f115c_0 +- xorg-libxext=1.3.4=h7f98852_1 +- gsl=2.7=he838d99_0 +- r-tidyr=1.3.0=r41h38f115c_0 +- libssh2=1.10.0=haa6b8db_3 +- r-bit64=4.0.5=r41h06615bd_1 +- ncurses=6.3=h27087fc_1 +- r-matrix=1.5_3=r41h5f7b363_0 +- r-uuid=1.1_0=r41h06615bd_1 +- cairo=1.16.0=ha61ee94_1014 +- libglib=2.74.1=h606061b_1 +- r-vctrs=0.5.2=r41h38f115c_0 +- freetype=2.12.1=hca18f0e_1 +- r-tinytex=0.43=r41hc72bb7e_0 +- xorg-libxdmcp=1.1.3=h7f98852_0 +- bioconductor-biocgenerics=0.40.0=r41hdfd78af_0 +- r-dtplyr=1.2.2=r41hc72bb7e_2 +- r-modelr=0.1.10=r41hc72bb7e_0 +- r-askpass=1.1=r41h06615bd_3 +- bioconductor-summarizedexperiment=1.24.0=r41hdfd78af_0 +- r-labeling=0.4.2=r41hc72bb7e_2 \ No newline at end of file diff --git a/workflow/envs/snakemake_cellranger.yaml b/workflow/envs/snakemake_cellranger.yaml new file mode 100644 index 0000000..d44ce8f --- /dev/null +++ b/workflow/envs/snakemake_cellranger.yaml @@ -0,0 +1,181 @@ +# this env is required, please install and activate for running snakemake +name: snakemake-cellranger +channels: +- conda-forge +- bioconda +- defaults +dependencies: +- libsqlite=3.39.4=h753d276_0 +- toposort=1.7=pyhd8ed1ab_0 +- importlib-metadata=5.0.0=pyha770c72_1 +- jinja2=3.1.2=pyhd8ed1ab_1 +- exceptiongroup=1.0.1=pyhd8ed1ab_0 +- veracitools=0.1.3=py_0 +- google-cloud-core=2.3.2=pyhd8ed1ab_0 +- coincbc=2.10.8=0_metapackage +- oauth2client=4.1.3=py_0 +- ubiquerg=0.6.2=pyhd8ed1ab_0 +- google-auth=2.14.0=pyh1a96a4e_0 +- python-dateutil=2.8.2=pyhd8ed1ab_0 +- liblapacke=3.9.0=16_linux64_openblas +- micromamba=1.0.0=1 +- coin-or-clp=1.17.7=hc56784d_2 +- aioeasywebdav=2.4.0=pyha770c72_0 +- libgomp=12.2.0=h65d4601_19 +- ply=3.11=py_1 +- importlib_resources=5.10.0=pyhd8ed1ab_0 +- libgfortran5=12.2.0=h337968e_19 +- datrie=0.8.2=py310h5764c6d_6 +- pip=22.3.1=pyhd8ed1ab_0 +- packaging=21.3=pyhd8ed1ab_0 +- dataclasses=0.8=pyhc8e2a94_3 +- pycparser=2.21=pyhd8ed1ab_0 +- configargparse=1.5.3=pyhd8ed1ab_0 +- urllib3=1.26.11=pyhd8ed1ab_0 +- colorama=0.4.6=pyhd8ed1ab_0 +- yarl=1.8.1=py310h5764c6d_0 +- psutil=5.9.4=py310h5764c6d_0 +- plac=1.3.5=pyhd8ed1ab_0 +- certifi=2022.9.24=pyhd8ed1ab_0 +- markupsafe=2.1.1=py310h5764c6d_2 +- toolz=0.12.0=pyhd8ed1ab_0 +- cachetools=5.2.0=pyhd8ed1ab_0 +- google-crc32c=1.1.2=py310he8fe98e_4 +- amply=0.1.5=pyhd8ed1ab_0 +- reretry=0.11.1=pyhd8ed1ab_0 +- c-ares=1.18.1=h7f98852_0 +- libsodium=1.0.18=h36c2ea0_1 +- peppy=0.35.2=pyhd8ed1ab_0 +- snakemake=7.18.1=hdfd78af_0 +- pytz=2022.6=pyhd8ed1ab_0 +- pytest=7.2.0=pyhd8ed1ab_2 +- libcblas=3.9.0=16_linux64_openblas +- httplib2=0.21.0=pyhd8ed1ab_0 +- yte=1.5.1=py310hff52083_1 +- pyrsistent=0.19.2=py310h5764c6d_0 +- libgrpc=1.49.1=h30feacc_1 +- pulp=2.7.0=py310hff52083_0 +- attrs=22.1.0=pyh71513ae_1 +- pandas=1.5.1=py310h769672d_1 +- multidict=6.0.2=py310h5764c6d_2 +- connection_pool=0.0.3=pyhd3deb0d_0 +- _libgcc_mutex=0.1=conda_forge +- google-api-core=2.10.2=pyhd8ed1ab_0 +- smmap=3.0.5=pyh44b312d_0 +- pygments=2.13.0=pyhd8ed1ab_0 +- wheel=0.38.3=pyhd8ed1ab_0 +- docutils=0.19=py310hff52083_1 +- ftputil=5.0.4=pyhd8ed1ab_0 +- conda=22.9.0=py310hff52083_2 +- gitdb=4.0.9=pyhd8ed1ab_0 +- aiohttp=3.8.3=py310h5764c6d_1 +- libgfortran-ng=12.2.0=h69a702a_19 +- stopit=1.1.2=py_0 +- defusedxml=0.7.1=pyhd8ed1ab_0 +- liblapack=3.9.0=16_linux64_openblas +- backports=1.0=py_2 +- numpy=1.23.4=py310h53a5b5f_1 +- iniconfig=1.1.1=pyh9f0ad1d_0 +- snakemake-minimal=7.18.1=pyhdfd78af_0 +- coin-or-cbc=2.10.8=h3786ebc_0 +- tzdata=2022f=h191b570_0 +- readline=8.1.2=h0f457ee_0 +- frozenlist=1.3.3=py310h5764c6d_0 +- filelock=3.8.0=pyhd8ed1ab_0 +- ruamel_yaml=0.15.80=py310h5764c6d_1008 +- pycosat=0.6.4=py310h5764c6d_1 +- logmuse=0.2.6=pyh8c360ce_0 +- boto3=1.26.5=pyhd8ed1ab_0 +- pyparsing=3.0.9=pyhd8ed1ab_0 +- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 +- pyu2f=0.1.5=pyhd8ed1ab_0 +- protobuf=4.21.9=py310hd8f1fbe_0 +- conda-package-handling=1.9.0=py310h5764c6d_1 +- pysftp=0.2.9=py_1 +- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 +- python-fastjsonschema=2.16.2=pyhd8ed1ab_0 +- ld_impl_linux-64=2.39=hc81fddc_0 +- ncurses=6.3=h27087fc_1 +- googleapis-common-protos=1.56.4=py310hff52083_1 +- google-cloud-storage=2.6.0=pyh1a96a4e_0 +- slacker=0.14.0=py_0 +- libffi=3.4.2=h7f98852_5 +- _openmp_mutex=4.5=2_gnu +- libzlib=1.2.13=h166bdaf_4 +- coin-or-utils=2.11.6=h202d8b1_2 +- libabseil=20220623.0=cxx17_h48a1fff_5 +- prettytable=3.4.1=pyhd8ed1ab_0 +- cffi=1.15.1=py310h255011f_2 +- typing-extensions=4.4.0=hd8ed1ab_0 +- pyyaml=6.0=py310h5764c6d_5 +- dropbox=11.35.0=pyhd8ed1ab_0 +- python_abi=3.10=2_cp310 +- nbformat=5.7.0=pyhd8ed1ab_0 +- libnsl=2.0.0=h7f98852_0 +- stone=3.3.1=pyhd8ed1ab_0 +- libblas=3.9.0=16_linux64_openblas +- cryptography=38.0.3=py310h600f1e7_0 +- google-auth-httplib2=0.1.0=pyhd8ed1ab_1 +- tk=8.6.12=h27826a3_0 +- libopenblas=0.3.21=pthreads_h78a6416_3 +- pyasn1-modules=0.2.7=py_0 +- jsonschema=4.17.0=pyhd8ed1ab_0 +- coin-or-cgl=0.60.6=h6f57e76_2 +- dpath=2.0.6=py310hff52083_2 +- google-api-python-client=2.65.0=pyhd8ed1ab_0 +- setuptools-scm=7.0.5=pyhd8ed1ab_1 +- rsa=4.9=pyhd8ed1ab_0 +- pyasn1=0.4.8=py_0 +- wcwidth=0.2.5=pyh9f0ad1d_2 +- tqdm=4.64.1=pyhd8ed1ab_0 +- traitlets=5.5.0=pyhd8ed1ab_0 +- wrapt=1.14.1=py310h5764c6d_1 +- zipp=3.10.0=pyhd8ed1ab_0 +- botocore=1.29.5=pyhd8ed1ab_0 +- idna=3.4=pyhd8ed1ab_0 +- google-resumable-media=2.4.0=pyhd8ed1ab_0 +- bcrypt=3.2.2=py310h5764c6d_1 +- attmap=0.13.2=pyhd8ed1ab_0 +- requests=2.28.1=pyhd8ed1ab_1 +- xz=5.2.6=h166bdaf_0 +- grpcio=1.49.1=py310hc32fa93_1 +- libprotobuf=3.21.9=h6239696_0 +- gitpython=3.1.29=pyhd8ed1ab_0 +- s3transfer=0.6.0=pyhd8ed1ab_0 +- yaml=0.2.5=h7f98852_2 +- ratelimiter=1.2.0=pyhd8ed1ab_1003 +- bzip2=1.0.8=h7f98852_4 +- ca-certificates=2022.9.24=ha878542_0 +- uritemplate=4.1.1=pyhd8ed1ab_0 +- future=0.18.2=pyhd8ed1ab_6 +- jupyter_core=4.11.2=py310hff52083_0 +- pluggy=1.0.0=pyhd8ed1ab_5 +- brotlipy=0.7.0=py310h5764c6d_1005 +- jmespath=1.0.1=pyhd8ed1ab_0 +- setuptools=65.5.1=pyhd8ed1ab_0 +- libcrc32c=1.1.2=h9c3ff4c_0 +- libstdcxx-ng=12.2.0=h46fd767_19 +- commonmark=0.9.1=py_0 +- zlib=1.2.13=h166bdaf_4 +- tabulate=0.9.0=pyhd8ed1ab_1 +- re2=2022.06.01=h27087fc_0 +- appdirs=1.4.4=pyh9f0ad1d_0 +- aiosignal=1.3.1=pyhd8ed1ab_0 +- paramiko=2.12.0=pyhd8ed1ab_0 +- filechunkio=1.8=py_2 +- charset-normalizer=2.1.1=pyhd8ed1ab_0 +- python-irodsclient=1.1.5=pyhd8ed1ab_0 +- rich=12.6.0=pyhd8ed1ab_0 +- async-timeout=4.0.2=pyhd8ed1ab_0 +- six=1.16.0=pyh6c4a22f_0 +- coin-or-osi=0.108.7=h2720bb7_2 +- pyopenssl=22.1.0=pyhd8ed1ab_0 +- libuuid=2.32.1=h7f98852_1000 +- pysocks=1.7.1=pyha2e5f31_6 +- openssl=3.0.7=h166bdaf_0 +- typing_extensions=4.4.0=pyha770c72_0 +- python=3.10.6=ha86cf86_0_cpython +- pynacl=1.5.0=py310h5764c6d_2 +- smart_open=6.2.0=pyha770c72_0 +- tomli=2.0.1=pyhd8ed1ab_0 +- libgcc-ng=12.2.0=h65d4601_19 \ No newline at end of file diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 2adaf20..c4565f7 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -2,6 +2,7 @@ library(tidyverse, quietly = TRUE) library(DropletUtils, quietly = TRUE) +set.seed(1) # construct SCE object from raw cellranger output print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix")) From 17648670cf91339d3d68267d49cb554e8ae866f6 Mon Sep 17 00:00:00 2001 From: leawlb Date: Tue, 31 Jan 2023 13:44:51 +0100 Subject: [PATCH 26/32] Merge branch 'add_SCE' of https://github.com/odomlab2/snakemake-cellranger into add_SCE --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index ff6e88e..0faa717 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,12 @@ snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --c - `--cluster` may change depending on the computational footprint of your analyses - `--configfile` should point to your personal configuration +For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. +For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add: + +```bash +--conda-frontend conda +``` + +to the snakemake command. +For multiple runs, it is recommended to install these packages directly into the snakemake environment as well. From ed54ac16ea5b216059b647dbdcd03a8e6a07e73c Mon Sep 17 00:00:00 2001 From: Fritjof Lammers Date: Fri, 17 Feb 2023 12:09:49 +0100 Subject: [PATCH 27/32] extract method to rename BIRTH columns --- utils/process_otp_metadata.py | 13 +++++++++++++ workflow/scripts/samples.py | 15 ++------------- 2 files changed, 15 insertions(+), 13 deletions(-) create mode 100644 utils/process_otp_metadata.py diff --git a/utils/process_otp_metadata.py b/utils/process_otp_metadata.py new file mode 100644 index 0000000..10ca797 --- /dev/null +++ b/utils/process_otp_metadata.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def rename_date_of_birth(row: pd.Series): + """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into + a single 'DATE_OF_BIRTH' column. + """ + values = [row["BIRTH"], row["DATE_OF_BIRTH"]] + dates = [val for val in values if not pd.isna(val)] + if len(dates) == 1: + return dates[0] + elif len(dates) != 1: + return pd.NA diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 66214b7..a447908 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,6 +2,7 @@ from snakemake.io import Wildcards + class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame @@ -23,7 +24,7 @@ class Samples: "LANE_NO", "READ", "CELLRANGER_FASTQ_PATH", - "individual"] # include 'individual' col to be generated below + "individual"] # include 'individual' col to be generated below def __init__(self, config): IDENTIFIERS = config["metadata"]["identifiers"] @@ -42,18 +43,6 @@ def __init__(self, config): self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS) - def rename_date_of_birth(self, - row: pd.Series): - """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into - a single 'DATE_OF_BIRTH' column. - """ - values = [row["BIRTH"], row["DATE_OF_BIRTH"]] - dates = [val for val in values if not pd.isna(val)] - if len(dates) == 1: - return dates[0] - elif len(dates) != 1: - return pd.NA - def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: """Add column containing CellRanger compatible filename, From 5d9d69f11676bb593357a1ee8b622a8da4e2b081 Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 27 Mar 2023 15:30:09 +0200 Subject: [PATCH 28/32] improved readability --- README.md | 20 ++- config/config-interspecies-bonemarrow.yaml | 4 +- workflow/Snakefile | 2 +- workflow/envs/snakemake_cellranger.yaml | 181 --------------------- workflow/scripts/construct_sce_objects.R | 19 +-- 5 files changed, 23 insertions(+), 203 deletions(-) delete mode 100644 workflow/envs/snakemake_cellranger.yaml diff --git a/README.md b/README.md index 0faa717..4341f2a 100644 --- a/README.md +++ b/README.md @@ -49,21 +49,23 @@ Minimal changes needed are: ## How to run -You may call the pipeline as follows in the directory where you cloned it. +Use snakemake_cellranger.yaml to build an environment with all required packages. ```bash -snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules +micromamba create -f snakemake_cellranger.yaml ``` - - `--cluster` may change depending on the computational footprint of your analyses - - `--configfile` should point to your personal configuration +or -For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. -For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add: +```bash +conda env create -f snakemake_cellranger.yaml +``` + +You may call the pipeline as follows in the directory where you cloned it. ```bash ---conda-frontend conda +snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules --conda-frontend conda ``` -to the snakemake command. -For multiple runs, it is recommended to install these packages directly into the snakemake environment as well. + - `--cluster` may change depending on the computational footprint of your analyses + - `--configfile` should point to your personal configuration diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml index 872907b..3eedb8e 100644 --- a/config/config-interspecies-bonemarrow.yaml +++ b/config/config-interspecies-bonemarrow.yaml @@ -1,7 +1,7 @@ paths: # directory to store output data = cellranger files and SingleCellExperiment objects # paths to input data = fastq files are stored in metadata sheet (OTP export) - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" target_templates: # paths for each type of output file # remove "/{0[Species_ID]}" to store all objects in one directory or @@ -16,7 +16,7 @@ paths: references: all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/" metadata: - raw: + table: # should have same format as or be based on OTP export sheet - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv" identifiers: diff --git a/workflow/Snakefile b/workflow/Snakefile index 9bc9bca..de65a9a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,7 +6,7 @@ from scripts.samples import Samples samples = Samples(config) -METADATA_PATH = config["metadata"]["raw"] +METADATA_PATH = config["metadata"]["table"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] REFERENCES = config["references"] diff --git a/workflow/envs/snakemake_cellranger.yaml b/workflow/envs/snakemake_cellranger.yaml deleted file mode 100644 index d44ce8f..0000000 --- a/workflow/envs/snakemake_cellranger.yaml +++ /dev/null @@ -1,181 +0,0 @@ -# this env is required, please install and activate for running snakemake -name: snakemake-cellranger -channels: -- conda-forge -- bioconda -- defaults -dependencies: -- libsqlite=3.39.4=h753d276_0 -- toposort=1.7=pyhd8ed1ab_0 -- importlib-metadata=5.0.0=pyha770c72_1 -- jinja2=3.1.2=pyhd8ed1ab_1 -- exceptiongroup=1.0.1=pyhd8ed1ab_0 -- veracitools=0.1.3=py_0 -- google-cloud-core=2.3.2=pyhd8ed1ab_0 -- coincbc=2.10.8=0_metapackage -- oauth2client=4.1.3=py_0 -- ubiquerg=0.6.2=pyhd8ed1ab_0 -- google-auth=2.14.0=pyh1a96a4e_0 -- python-dateutil=2.8.2=pyhd8ed1ab_0 -- liblapacke=3.9.0=16_linux64_openblas -- micromamba=1.0.0=1 -- coin-or-clp=1.17.7=hc56784d_2 -- aioeasywebdav=2.4.0=pyha770c72_0 -- libgomp=12.2.0=h65d4601_19 -- ply=3.11=py_1 -- importlib_resources=5.10.0=pyhd8ed1ab_0 -- libgfortran5=12.2.0=h337968e_19 -- datrie=0.8.2=py310h5764c6d_6 -- pip=22.3.1=pyhd8ed1ab_0 -- packaging=21.3=pyhd8ed1ab_0 -- dataclasses=0.8=pyhc8e2a94_3 -- pycparser=2.21=pyhd8ed1ab_0 -- configargparse=1.5.3=pyhd8ed1ab_0 -- urllib3=1.26.11=pyhd8ed1ab_0 -- colorama=0.4.6=pyhd8ed1ab_0 -- yarl=1.8.1=py310h5764c6d_0 -- psutil=5.9.4=py310h5764c6d_0 -- plac=1.3.5=pyhd8ed1ab_0 -- certifi=2022.9.24=pyhd8ed1ab_0 -- markupsafe=2.1.1=py310h5764c6d_2 -- toolz=0.12.0=pyhd8ed1ab_0 -- cachetools=5.2.0=pyhd8ed1ab_0 -- google-crc32c=1.1.2=py310he8fe98e_4 -- amply=0.1.5=pyhd8ed1ab_0 -- reretry=0.11.1=pyhd8ed1ab_0 -- c-ares=1.18.1=h7f98852_0 -- libsodium=1.0.18=h36c2ea0_1 -- peppy=0.35.2=pyhd8ed1ab_0 -- snakemake=7.18.1=hdfd78af_0 -- pytz=2022.6=pyhd8ed1ab_0 -- pytest=7.2.0=pyhd8ed1ab_2 -- libcblas=3.9.0=16_linux64_openblas -- httplib2=0.21.0=pyhd8ed1ab_0 -- yte=1.5.1=py310hff52083_1 -- pyrsistent=0.19.2=py310h5764c6d_0 -- libgrpc=1.49.1=h30feacc_1 -- pulp=2.7.0=py310hff52083_0 -- attrs=22.1.0=pyh71513ae_1 -- pandas=1.5.1=py310h769672d_1 -- multidict=6.0.2=py310h5764c6d_2 -- connection_pool=0.0.3=pyhd3deb0d_0 -- _libgcc_mutex=0.1=conda_forge -- google-api-core=2.10.2=pyhd8ed1ab_0 -- smmap=3.0.5=pyh44b312d_0 -- pygments=2.13.0=pyhd8ed1ab_0 -- wheel=0.38.3=pyhd8ed1ab_0 -- docutils=0.19=py310hff52083_1 -- ftputil=5.0.4=pyhd8ed1ab_0 -- conda=22.9.0=py310hff52083_2 -- gitdb=4.0.9=pyhd8ed1ab_0 -- aiohttp=3.8.3=py310h5764c6d_1 -- libgfortran-ng=12.2.0=h69a702a_19 -- stopit=1.1.2=py_0 -- defusedxml=0.7.1=pyhd8ed1ab_0 -- liblapack=3.9.0=16_linux64_openblas -- backports=1.0=py_2 -- numpy=1.23.4=py310h53a5b5f_1 -- iniconfig=1.1.1=pyh9f0ad1d_0 -- snakemake-minimal=7.18.1=pyhdfd78af_0 -- coin-or-cbc=2.10.8=h3786ebc_0 -- tzdata=2022f=h191b570_0 -- readline=8.1.2=h0f457ee_0 -- frozenlist=1.3.3=py310h5764c6d_0 -- filelock=3.8.0=pyhd8ed1ab_0 -- ruamel_yaml=0.15.80=py310h5764c6d_1008 -- pycosat=0.6.4=py310h5764c6d_1 -- logmuse=0.2.6=pyh8c360ce_0 -- boto3=1.26.5=pyhd8ed1ab_0 -- pyparsing=3.0.9=pyhd8ed1ab_0 -- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 -- pyu2f=0.1.5=pyhd8ed1ab_0 -- protobuf=4.21.9=py310hd8f1fbe_0 -- conda-package-handling=1.9.0=py310h5764c6d_1 -- pysftp=0.2.9=py_1 -- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 -- python-fastjsonschema=2.16.2=pyhd8ed1ab_0 -- ld_impl_linux-64=2.39=hc81fddc_0 -- ncurses=6.3=h27087fc_1 -- googleapis-common-protos=1.56.4=py310hff52083_1 -- google-cloud-storage=2.6.0=pyh1a96a4e_0 -- slacker=0.14.0=py_0 -- libffi=3.4.2=h7f98852_5 -- _openmp_mutex=4.5=2_gnu -- libzlib=1.2.13=h166bdaf_4 -- coin-or-utils=2.11.6=h202d8b1_2 -- libabseil=20220623.0=cxx17_h48a1fff_5 -- prettytable=3.4.1=pyhd8ed1ab_0 -- cffi=1.15.1=py310h255011f_2 -- typing-extensions=4.4.0=hd8ed1ab_0 -- pyyaml=6.0=py310h5764c6d_5 -- dropbox=11.35.0=pyhd8ed1ab_0 -- python_abi=3.10=2_cp310 -- nbformat=5.7.0=pyhd8ed1ab_0 -- libnsl=2.0.0=h7f98852_0 -- stone=3.3.1=pyhd8ed1ab_0 -- libblas=3.9.0=16_linux64_openblas -- cryptography=38.0.3=py310h600f1e7_0 -- google-auth-httplib2=0.1.0=pyhd8ed1ab_1 -- tk=8.6.12=h27826a3_0 -- libopenblas=0.3.21=pthreads_h78a6416_3 -- pyasn1-modules=0.2.7=py_0 -- jsonschema=4.17.0=pyhd8ed1ab_0 -- coin-or-cgl=0.60.6=h6f57e76_2 -- dpath=2.0.6=py310hff52083_2 -- google-api-python-client=2.65.0=pyhd8ed1ab_0 -- setuptools-scm=7.0.5=pyhd8ed1ab_1 -- rsa=4.9=pyhd8ed1ab_0 -- pyasn1=0.4.8=py_0 -- wcwidth=0.2.5=pyh9f0ad1d_2 -- tqdm=4.64.1=pyhd8ed1ab_0 -- traitlets=5.5.0=pyhd8ed1ab_0 -- wrapt=1.14.1=py310h5764c6d_1 -- zipp=3.10.0=pyhd8ed1ab_0 -- botocore=1.29.5=pyhd8ed1ab_0 -- idna=3.4=pyhd8ed1ab_0 -- google-resumable-media=2.4.0=pyhd8ed1ab_0 -- bcrypt=3.2.2=py310h5764c6d_1 -- attmap=0.13.2=pyhd8ed1ab_0 -- requests=2.28.1=pyhd8ed1ab_1 -- xz=5.2.6=h166bdaf_0 -- grpcio=1.49.1=py310hc32fa93_1 -- libprotobuf=3.21.9=h6239696_0 -- gitpython=3.1.29=pyhd8ed1ab_0 -- s3transfer=0.6.0=pyhd8ed1ab_0 -- yaml=0.2.5=h7f98852_2 -- ratelimiter=1.2.0=pyhd8ed1ab_1003 -- bzip2=1.0.8=h7f98852_4 -- ca-certificates=2022.9.24=ha878542_0 -- uritemplate=4.1.1=pyhd8ed1ab_0 -- future=0.18.2=pyhd8ed1ab_6 -- jupyter_core=4.11.2=py310hff52083_0 -- pluggy=1.0.0=pyhd8ed1ab_5 -- brotlipy=0.7.0=py310h5764c6d_1005 -- jmespath=1.0.1=pyhd8ed1ab_0 -- setuptools=65.5.1=pyhd8ed1ab_0 -- libcrc32c=1.1.2=h9c3ff4c_0 -- libstdcxx-ng=12.2.0=h46fd767_19 -- commonmark=0.9.1=py_0 -- zlib=1.2.13=h166bdaf_4 -- tabulate=0.9.0=pyhd8ed1ab_1 -- re2=2022.06.01=h27087fc_0 -- appdirs=1.4.4=pyh9f0ad1d_0 -- aiosignal=1.3.1=pyhd8ed1ab_0 -- paramiko=2.12.0=pyhd8ed1ab_0 -- filechunkio=1.8=py_2 -- charset-normalizer=2.1.1=pyhd8ed1ab_0 -- python-irodsclient=1.1.5=pyhd8ed1ab_0 -- rich=12.6.0=pyhd8ed1ab_0 -- async-timeout=4.0.2=pyhd8ed1ab_0 -- six=1.16.0=pyh6c4a22f_0 -- coin-or-osi=0.108.7=h2720bb7_2 -- pyopenssl=22.1.0=pyhd8ed1ab_0 -- libuuid=2.32.1=h7f98852_1000 -- pysocks=1.7.1=pyha2e5f31_6 -- openssl=3.0.7=h166bdaf_0 -- typing_extensions=4.4.0=pyha770c72_0 -- python=3.10.6=ha86cf86_0_cpython -- pynacl=1.5.0=py310h5764c6d_2 -- smart_open=6.2.0=pyha770c72_0 -- tomli=2.0.1=pyhd8ed1ab_0 -- libgcc-ng=12.2.0=h65d4601_19 \ No newline at end of file diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index c4565f7..4be32d8 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -4,6 +4,9 @@ library(tidyverse, quietly = TRUE) library(DropletUtils, quietly = TRUE) set.seed(1) +individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample +IDENTIFIERS <- snakemake@params[["identifiers"]] + # construct SCE object from raw cellranger output print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix")) sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], @@ -20,25 +23,21 @@ metadata <- read.csv(file = snakemake@params[["metadata"]], as.is=TRUE, colClasses = "character") -individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample -IDENTIFIERS <- snakemake@params[["identifiers"]] - # if necessary, concatenate identifiers again to obtain all possible wildcards -metadata_curr <- metadata -if(! "individual" %in% colnames(metadata_curr)){ - metadata_curr <- metadata_curr %>% +if(! "individual" %in% colnames(metadata)){ + metadata <- metadata %>% unite(col = "individual", all_of(IDENTIFIERS), sep = "_", remove = FALSE) print('establishing "individual" column, construct_sce_objects.R') } # subset data as specified by wildcard and single_cell_object_metadata_fields -metadata_curr <- metadata_curr[which(metadata_curr$individual == individual_curr),] +metadata <- metadata[which(metadata$individual == individual_curr),] cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] -metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add] +metadata <- metadata[,colnames(metadata) %in% cols_add] # add cell-level metadata to each barcode in the SCE object -for(i in colnames(metadata_curr)){ - colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce)) +for(i in colnames(metadata)){ + colData(sce)[i] <- rep(metadata[1,i], ncol(sce)) } print(sce) From 47c9b94db7075df7c988501064d91521d2e70ad1 Mon Sep 17 00:00:00 2001 From: leawlb Date: Mon, 27 Mar 2023 15:46:34 +0200 Subject: [PATCH 29/32] readability improvements --- README.md | 8 +- snakemake_cellranger.yaml | 181 +++++++++++++++++++++++ workflow/Snakefile | 4 +- workflow/scripts/construct_sce_objects.R | 2 +- 4 files changed, 190 insertions(+), 5 deletions(-) create mode 100644 snakemake_cellranger.yaml diff --git a/README.md b/README.md index 4341f2a..3a3289c 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Minimal changes needed are: ## How to run -Use snakemake_cellranger.yaml to build an environment with all required packages. +Use snakemake_cellranger.yaml to create an environment with all required packages. ```bash micromamba create -f snakemake_cellranger.yaml @@ -61,6 +61,12 @@ or conda env create -f snakemake_cellranger.yaml ``` +Set channel priority to strict. + +```bash +conda config --set channel_priority strict +``` + You may call the pipeline as follows in the directory where you cloned it. ```bash diff --git a/snakemake_cellranger.yaml b/snakemake_cellranger.yaml new file mode 100644 index 0000000..d44ce8f --- /dev/null +++ b/snakemake_cellranger.yaml @@ -0,0 +1,181 @@ +# this env is required, please install and activate for running snakemake +name: snakemake-cellranger +channels: +- conda-forge +- bioconda +- defaults +dependencies: +- libsqlite=3.39.4=h753d276_0 +- toposort=1.7=pyhd8ed1ab_0 +- importlib-metadata=5.0.0=pyha770c72_1 +- jinja2=3.1.2=pyhd8ed1ab_1 +- exceptiongroup=1.0.1=pyhd8ed1ab_0 +- veracitools=0.1.3=py_0 +- google-cloud-core=2.3.2=pyhd8ed1ab_0 +- coincbc=2.10.8=0_metapackage +- oauth2client=4.1.3=py_0 +- ubiquerg=0.6.2=pyhd8ed1ab_0 +- google-auth=2.14.0=pyh1a96a4e_0 +- python-dateutil=2.8.2=pyhd8ed1ab_0 +- liblapacke=3.9.0=16_linux64_openblas +- micromamba=1.0.0=1 +- coin-or-clp=1.17.7=hc56784d_2 +- aioeasywebdav=2.4.0=pyha770c72_0 +- libgomp=12.2.0=h65d4601_19 +- ply=3.11=py_1 +- importlib_resources=5.10.0=pyhd8ed1ab_0 +- libgfortran5=12.2.0=h337968e_19 +- datrie=0.8.2=py310h5764c6d_6 +- pip=22.3.1=pyhd8ed1ab_0 +- packaging=21.3=pyhd8ed1ab_0 +- dataclasses=0.8=pyhc8e2a94_3 +- pycparser=2.21=pyhd8ed1ab_0 +- configargparse=1.5.3=pyhd8ed1ab_0 +- urllib3=1.26.11=pyhd8ed1ab_0 +- colorama=0.4.6=pyhd8ed1ab_0 +- yarl=1.8.1=py310h5764c6d_0 +- psutil=5.9.4=py310h5764c6d_0 +- plac=1.3.5=pyhd8ed1ab_0 +- certifi=2022.9.24=pyhd8ed1ab_0 +- markupsafe=2.1.1=py310h5764c6d_2 +- toolz=0.12.0=pyhd8ed1ab_0 +- cachetools=5.2.0=pyhd8ed1ab_0 +- google-crc32c=1.1.2=py310he8fe98e_4 +- amply=0.1.5=pyhd8ed1ab_0 +- reretry=0.11.1=pyhd8ed1ab_0 +- c-ares=1.18.1=h7f98852_0 +- libsodium=1.0.18=h36c2ea0_1 +- peppy=0.35.2=pyhd8ed1ab_0 +- snakemake=7.18.1=hdfd78af_0 +- pytz=2022.6=pyhd8ed1ab_0 +- pytest=7.2.0=pyhd8ed1ab_2 +- libcblas=3.9.0=16_linux64_openblas +- httplib2=0.21.0=pyhd8ed1ab_0 +- yte=1.5.1=py310hff52083_1 +- pyrsistent=0.19.2=py310h5764c6d_0 +- libgrpc=1.49.1=h30feacc_1 +- pulp=2.7.0=py310hff52083_0 +- attrs=22.1.0=pyh71513ae_1 +- pandas=1.5.1=py310h769672d_1 +- multidict=6.0.2=py310h5764c6d_2 +- connection_pool=0.0.3=pyhd3deb0d_0 +- _libgcc_mutex=0.1=conda_forge +- google-api-core=2.10.2=pyhd8ed1ab_0 +- smmap=3.0.5=pyh44b312d_0 +- pygments=2.13.0=pyhd8ed1ab_0 +- wheel=0.38.3=pyhd8ed1ab_0 +- docutils=0.19=py310hff52083_1 +- ftputil=5.0.4=pyhd8ed1ab_0 +- conda=22.9.0=py310hff52083_2 +- gitdb=4.0.9=pyhd8ed1ab_0 +- aiohttp=3.8.3=py310h5764c6d_1 +- libgfortran-ng=12.2.0=h69a702a_19 +- stopit=1.1.2=py_0 +- defusedxml=0.7.1=pyhd8ed1ab_0 +- liblapack=3.9.0=16_linux64_openblas +- backports=1.0=py_2 +- numpy=1.23.4=py310h53a5b5f_1 +- iniconfig=1.1.1=pyh9f0ad1d_0 +- snakemake-minimal=7.18.1=pyhdfd78af_0 +- coin-or-cbc=2.10.8=h3786ebc_0 +- tzdata=2022f=h191b570_0 +- readline=8.1.2=h0f457ee_0 +- frozenlist=1.3.3=py310h5764c6d_0 +- filelock=3.8.0=pyhd8ed1ab_0 +- ruamel_yaml=0.15.80=py310h5764c6d_1008 +- pycosat=0.6.4=py310h5764c6d_1 +- logmuse=0.2.6=pyh8c360ce_0 +- boto3=1.26.5=pyhd8ed1ab_0 +- pyparsing=3.0.9=pyhd8ed1ab_0 +- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 +- pyu2f=0.1.5=pyhd8ed1ab_0 +- protobuf=4.21.9=py310hd8f1fbe_0 +- conda-package-handling=1.9.0=py310h5764c6d_1 +- pysftp=0.2.9=py_1 +- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 +- python-fastjsonschema=2.16.2=pyhd8ed1ab_0 +- ld_impl_linux-64=2.39=hc81fddc_0 +- ncurses=6.3=h27087fc_1 +- googleapis-common-protos=1.56.4=py310hff52083_1 +- google-cloud-storage=2.6.0=pyh1a96a4e_0 +- slacker=0.14.0=py_0 +- libffi=3.4.2=h7f98852_5 +- _openmp_mutex=4.5=2_gnu +- libzlib=1.2.13=h166bdaf_4 +- coin-or-utils=2.11.6=h202d8b1_2 +- libabseil=20220623.0=cxx17_h48a1fff_5 +- prettytable=3.4.1=pyhd8ed1ab_0 +- cffi=1.15.1=py310h255011f_2 +- typing-extensions=4.4.0=hd8ed1ab_0 +- pyyaml=6.0=py310h5764c6d_5 +- dropbox=11.35.0=pyhd8ed1ab_0 +- python_abi=3.10=2_cp310 +- nbformat=5.7.0=pyhd8ed1ab_0 +- libnsl=2.0.0=h7f98852_0 +- stone=3.3.1=pyhd8ed1ab_0 +- libblas=3.9.0=16_linux64_openblas +- cryptography=38.0.3=py310h600f1e7_0 +- google-auth-httplib2=0.1.0=pyhd8ed1ab_1 +- tk=8.6.12=h27826a3_0 +- libopenblas=0.3.21=pthreads_h78a6416_3 +- pyasn1-modules=0.2.7=py_0 +- jsonschema=4.17.0=pyhd8ed1ab_0 +- coin-or-cgl=0.60.6=h6f57e76_2 +- dpath=2.0.6=py310hff52083_2 +- google-api-python-client=2.65.0=pyhd8ed1ab_0 +- setuptools-scm=7.0.5=pyhd8ed1ab_1 +- rsa=4.9=pyhd8ed1ab_0 +- pyasn1=0.4.8=py_0 +- wcwidth=0.2.5=pyh9f0ad1d_2 +- tqdm=4.64.1=pyhd8ed1ab_0 +- traitlets=5.5.0=pyhd8ed1ab_0 +- wrapt=1.14.1=py310h5764c6d_1 +- zipp=3.10.0=pyhd8ed1ab_0 +- botocore=1.29.5=pyhd8ed1ab_0 +- idna=3.4=pyhd8ed1ab_0 +- google-resumable-media=2.4.0=pyhd8ed1ab_0 +- bcrypt=3.2.2=py310h5764c6d_1 +- attmap=0.13.2=pyhd8ed1ab_0 +- requests=2.28.1=pyhd8ed1ab_1 +- xz=5.2.6=h166bdaf_0 +- grpcio=1.49.1=py310hc32fa93_1 +- libprotobuf=3.21.9=h6239696_0 +- gitpython=3.1.29=pyhd8ed1ab_0 +- s3transfer=0.6.0=pyhd8ed1ab_0 +- yaml=0.2.5=h7f98852_2 +- ratelimiter=1.2.0=pyhd8ed1ab_1003 +- bzip2=1.0.8=h7f98852_4 +- ca-certificates=2022.9.24=ha878542_0 +- uritemplate=4.1.1=pyhd8ed1ab_0 +- future=0.18.2=pyhd8ed1ab_6 +- jupyter_core=4.11.2=py310hff52083_0 +- pluggy=1.0.0=pyhd8ed1ab_5 +- brotlipy=0.7.0=py310h5764c6d_1005 +- jmespath=1.0.1=pyhd8ed1ab_0 +- setuptools=65.5.1=pyhd8ed1ab_0 +- libcrc32c=1.1.2=h9c3ff4c_0 +- libstdcxx-ng=12.2.0=h46fd767_19 +- commonmark=0.9.1=py_0 +- zlib=1.2.13=h166bdaf_4 +- tabulate=0.9.0=pyhd8ed1ab_1 +- re2=2022.06.01=h27087fc_0 +- appdirs=1.4.4=pyh9f0ad1d_0 +- aiosignal=1.3.1=pyhd8ed1ab_0 +- paramiko=2.12.0=pyhd8ed1ab_0 +- filechunkio=1.8=py_2 +- charset-normalizer=2.1.1=pyhd8ed1ab_0 +- python-irodsclient=1.1.5=pyhd8ed1ab_0 +- rich=12.6.0=pyhd8ed1ab_0 +- async-timeout=4.0.2=pyhd8ed1ab_0 +- six=1.16.0=pyh6c4a22f_0 +- coin-or-osi=0.108.7=h2720bb7_2 +- pyopenssl=22.1.0=pyhd8ed1ab_0 +- libuuid=2.32.1=h7f98852_1000 +- pysocks=1.7.1=pyha2e5f31_6 +- openssl=3.0.7=h166bdaf_0 +- typing_extensions=4.4.0=pyha770c72_0 +- python=3.10.6=ha86cf86_0_cpython +- pynacl=1.5.0=py310h5764c6d_2 +- smart_open=6.2.0=pyha770c72_0 +- tomli=2.0.1=pyhd8ed1ab_0 +- libgcc-ng=12.2.0=h65d4601_19 \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index de65a9a..8c705fb 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -97,9 +97,6 @@ rule cellranger_count: rule construct_sce_objects: """construct SCE objects from counts matrices - - set channel priority to strict with << conda config --set channel_priority strict >> - use << --conda-frontend conda >> in snakemake command Initially, building the environment might take some time. @@ -113,6 +110,7 @@ rule construct_sce_objects: identifiers = IDENTIFIERS, metadata = METADATA_PATH, single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] + threads: 2 conda: "envs/construct_sce_objects.yaml" script: diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R index 4be32d8..3e21d8f 100644 --- a/workflow/scripts/construct_sce_objects.R +++ b/workflow/scripts/construct_sce_objects.R @@ -31,7 +31,7 @@ if(! "individual" %in% colnames(metadata)){ } # subset data as specified by wildcard and single_cell_object_metadata_fields -metadata <- metadata[which(metadata$individual == individual_curr),] +metadata <- metadata |> filter(individual == individual_curr) cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] metadata <- metadata[,colnames(metadata) %in% cols_add] From c5033c6d6f461077a3d1a1ac18e33dc4562a1374 Mon Sep 17 00:00:00 2001 From: leawlb <76486615+leawlb@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:54:25 +0200 Subject: [PATCH 30/32] Update workflow/scripts/samples.py Co-authored-by: Fritjof Lammers <25619971+mobilegenome@users.noreply.github.com> --- workflow/scripts/samples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index a447908..1f85517 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -41,7 +41,7 @@ def __init__(self, config): metadata_full = self.get_cellranger_filename(metadata_full) - self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS) + self.metadata = self.select_columns(metadata_full, custom_columns = IDENTIFIERS) def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: From b2d5b8c1d770f832d9cbbf1acf68636ebf30060f Mon Sep 17 00:00:00 2001 From: leawlb Date: Tue, 28 Mar 2023 09:50:19 +0200 Subject: [PATCH 31/32] adjusted select_columns function plus minor corrections --- README.md | 2 +- config/config-interspecies-bonemarrow.yaml | 2 +- workflow/scripts/samples.py | 60 ++++++++++++---------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 3a3289c..7580ac7 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ conda config --set channel_priority strict You may call the pipeline as follows in the directory where you cloned it. ```bash -snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules --conda-frontend conda +snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules --conda-frontend conda ``` - `--cluster` may change depending on the computational footprint of your analyses diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml index 3eedb8e..4a571cf 100644 --- a/config/config-interspecies-bonemarrow.yaml +++ b/config/config-interspecies-bonemarrow.yaml @@ -1,7 +1,7 @@ paths: # directory to store output data = cellranger files and SingleCellExperiment objects # paths to input data = fastq files are stored in metadata sheet (OTP export) - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test2" target_templates: # paths for each type of output file # remove "/{0[Species_ID]}" to store all objects in one directory or diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index 1f85517..a41a331 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,46 +2,46 @@ from snakemake.io import Wildcards - class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame that provides metadata for the workflow. """ - # columns to select - columns = ["PID", - "Sample Type", - "Species Scientific Name", - "STRAIN", - "FastQ Path", - "FastQC Path", - "Run ID", - "ILSE_NO", - "TISSUE", - "BIRTH", - "DATE_OF_BIRTH", - "DATE_OF_DEATH", - "LANE_NO", - "READ", - "CELLRANGER_FASTQ_PATH", - "individual"] # include 'individual' col to be generated below def __init__(self, config): IDENTIFIERS = config["metadata"]["identifiers"] - metadata_files = config["metadata"]["raw"] + metadata_files = config["metadata"]["table"] self.output_base_dir = config["paths"]["output_dir"] self.target_templates = config["paths"]["target_templates"] + + # columns to select for subsampling + columns = ["PID", + "Sample Type", + "Species Scientific Name", + "STRAIN", + "FastQ Path", + "FastQC Path", + "Run ID", + "ILSE_NO", + "TISSUE", + "BIRTH", + "DATE_OF_BIRTH", + "DATE_OF_DEATH", + "LANE_NO", + "READ", + "CELLRANGER_FASTQ_PATH", + "individual"] # include 'individual' col to be generated below metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True) - # generate 'individual' column containing all concatenated identifiers + # if required, generate 'individual' column containing all concatenated identifiers if not "individual" in metadata_full.columns: metadata_full['individual'] = metadata_full[IDENTIFIERS].apply('_'.join, axis=1) metadata_full = self.get_cellranger_filename(metadata_full) - self.metadata = self.select_columns(metadata_full, custom_columns = IDENTIFIERS) + self.metadata = self.select_columns(metadata_full, custom_columns = columns + IDENTIFIERS) def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: @@ -69,14 +69,18 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: def select_columns(self, df: pd.DataFrame, - columns: list = None, - identifiers: str = None): + custom_columns: list = None): """Select/Subset columns from DataFrame to reduce - DataFrame dimensions. """ - - if not columns: - columns = self.columns + identifiers - return df[columns] + DataFrame dimensions. A list of column names can be provided by `custom_columns` """ + + if custom_columns: + print(custom_columns) + df_subset = df[custom_columns] + else: + print(custom_columns) + df_subset = df[columns] + + return df_subset @staticmethod def filter_by_wildcards( From cdcda74c65592cd3ea79300724136a91c21ee909 Mon Sep 17 00:00:00 2001 From: leawlb Date: Thu, 30 Mar 2023 09:20:23 +0200 Subject: [PATCH 32/32] minor corrections --- config/config-interspecies-bonemarrow.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml index 4a571cf..3eedb8e 100644 --- a/config/config-interspecies-bonemarrow.yaml +++ b/config/config-interspecies-bonemarrow.yaml @@ -1,7 +1,7 @@ paths: # directory to store output data = cellranger files and SingleCellExperiment objects # paths to input data = fastq files are stored in metadata sheet (OTP export) - output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test2" + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" target_templates: # paths for each type of output file # remove "/{0[Species_ID]}" to store all objects in one directory or