From 6351c4ddc5083db5d7baa6a1e39f20d7be091189 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Tue, 4 Oct 2022 16:45:29 +0200
Subject: [PATCH 01/32] most minimal changes that still work in dry run

---
 config/config-test.yaml     |  8 ++++----
 workflow/Snakefile          |  7 +++++--
 workflow/scripts/samples.py | 10 ----------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index a4904a1..df1436d 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -1,5 +1,5 @@
 paths:
-  output_dir: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/results"
+  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
   target_templates:
     linked_files: "linked_files/{0[individual]}/{0[sample_type]}"
     cellranger_count: "cellranger_count/{0[individual]}_{0[sample_type]}/outs"
@@ -11,7 +11,7 @@ references:
   C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
 metadata:
   raw:
-   - "~/projects/bonemarrow-study/data/otp-export/2022-09-21/OE0538_DO-0008_mmus/data-files.csv"
+   - "/omics/groups/OE0538/internal/users/l012t/Interspecies_BM_phd/data/data-files.csv"
   identifiers:
     - SAMPLE_NAME
     - Sample_Type
@@ -26,7 +26,7 @@ metadata:
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
   cellranger_count:
-    extra: ""  # set additional arguments for cellranger count
+#    extra: ""  # set additional arguments for cellranger count
   allele_specific: False
   wasp_filter_reads: False
-  realign_bam: False
+  realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 184622b..8e65206 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -4,6 +4,8 @@ import os
 from pathlib import Path
 from scripts.samples import Samples
 
+print("works")
+
 samples = Samples(config)
 
 REFERENCES = config["references"]
@@ -15,6 +17,7 @@ def get_sample_string(identifiers):
 SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
 
 print(samples.metadata)
+print(SAMPLE_WILDCARDS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
@@ -79,7 +82,7 @@ rule cellranger_count:
     params:
         genome=REFERENCES["all_masked"],
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1],  # remove last 2 levels from output path
-        extra=config["rules"]["cellranger_count"]["extra"]
+        #extra=config["rules"]["cellranger_count"]["extra"]
     log:
         "logs/cellranger_count_{individual}_{sample_type}.log"
     threads: 16
@@ -92,6 +95,6 @@ rule cellranger_count:
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
-        "{extra} "
+        #"{extra} "
         "--sample {wildcards.individual}_{wildcards.sample_type} "
         "> {log}"
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index bf9c40a..56d557c 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -7,8 +7,6 @@ class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
     that provides metadata for the workflow.
-
-
     """
     # columns to select
     columns = ["PID",
@@ -64,11 +62,8 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 
         """Add column containing CellRanger compatible filename,
         i.e. in the format of
-
         [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz
-
         Here, [Sample Name] consists of PID and Sample Type.
-
         See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input
         """
 
@@ -104,7 +99,6 @@ def filter_by_wildcards(
     ) -> list:
         """Get item from DataFrame by subsetting using the following index
         attributes provided via Snakemakes wildcards object.
-
         :return param:
         """
 
@@ -138,7 +132,3 @@ def targets(self) -> list:
 
         targets = list(map(lambda x: self.metadata.agg(x.format, axis=1).drop_duplicates().to_list(), target_templates.values()))
         return targets
-
-
-
-#grouped = metadata_subset.groupby(["PID", "Sample Type"])["FastQ Path"].apply(list)

From 18d078dbefb749644dc8a65aa9610dfafd21a5b6 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 5 Oct 2022 10:50:02 +0200
Subject: [PATCH 02/32] customized for one object_ID as identifier

---
 config/config-test.yaml     | 13 ++++++-------
 workflow/Snakefile          | 27 ++++++++++++++-------------
 workflow/scripts/samples.py | 14 +++++++-------
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index df1436d..54e6b0c 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -1,8 +1,8 @@
 paths:
   output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
   target_templates:
-    linked_files: "linked_files/{0[individual]}/{0[sample_type]}"
-    cellranger_count: "cellranger_count/{0[individual]}_{0[sample_type]}/outs"
+    linked_files: "linked_files/{0[individual]}"
+    cellranger_count: "cellranger_count/{0[individual]}/outs"
     samples_sheet: "metadata/samples.csv"
 references:
   all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_all_strains"
@@ -11,12 +11,11 @@ references:
   C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
 metadata:
   raw:
-   - "/omics/groups/OE0538/internal/users/l012t/Interspecies_BM_phd/data/data-files.csv"
+   - "~/Interspecies_BM_phd/data/metadata_full.csv"
   identifiers:
-    - SAMPLE_NAME
-    - Sample_Type
+    - Object_ID
     - Age
-    - fraction
+    - Fraction
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
@@ -26,7 +25,7 @@ metadata:
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
   cellranger_count:
-#    extra: ""  # set additional arguments for cellranger count
+    extra: ""  # set additional arguments for cellranger count
   allele_specific: False
   wasp_filter_reads: False
   realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 8e65206..b76e796 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -4,8 +4,6 @@ import os
 from pathlib import Path
 from scripts.samples import Samples
 
-print("works")
-
 samples = Samples(config)
 
 REFERENCES = config["references"]
@@ -16,20 +14,21 @@ def get_sample_string(identifiers):
 
 SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
 
-print(samples.metadata)
-print(SAMPLE_WILDCARDS)
+#rint(samples.metadata)
+#print(SAMPLE_WILDCARDS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
 wildcard_constraints:
-    sample_type="[A-z0-9_-]+",
-    individual="[A-z0-9_-]+"
+    individual="[A-z0-9_.-]+"
+
+print(samples.targets)
+
 
 rule all:
     input:
         samples.targets
 
-
 rule link_files:
     """Link files with filename pattern required for Cellranger
     
@@ -44,9 +43,9 @@ rule link_files:
 
         )
     output:
-        directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/{{sample_type}}/")
+        directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/")
     log:
-        "logs/link_files_{individual}_{sample_type}.log"
+        "logs/link_files_{individual}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -69,6 +68,8 @@ rule write_samples_spreadsheet:
             samples.metadata.to_csv(output[0], index=False)
             print("done.", file=log_fh)
 
+print("works")
+
 rule cellranger_count:
     """Run Cellranger count
     
@@ -78,23 +79,23 @@ rule cellranger_count:
     input:
         lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output]
     output:
-        outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}_{{sample_type}}/outs"
+        outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs"
     params:
         genome=REFERENCES["all_masked"],
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1],  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        "logs/cellranger_count_{individual}_{sample_type}.log"
+        "logs/cellranger_count_{individual}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",
     shell:
         "cd {params.output_root_dir}; "
         "cellranger count "
-        "--id {wildcards.individual}_{wildcards.sample_type} "  # letters, numbers, hyphens, and underscores
+        "--id {wildcards.individual} "  # letters, numbers, hyphens, and underscores
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
         #"{extra} "
-        "--sample {wildcards.individual}_{wildcards.sample_type} "
+        "--sample {wildcards.individual} "
         "> {log}"
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 56d557c..76e8f1a 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -23,12 +23,12 @@ class Samples:
                "DATE_OF_DEATH",
                "LANE_NO",
                "READ",
-               "CELLRANGER_FASTQ_PATH"]
+               "CELLRANGER_FASTQ_PATH",
+               "Object_ID"]
 
     # map to rename columns in the format of (old):(new)
     columns_map = {
-        "PID": "individual",
-        "Sample Type": "sample_type",
+        "Object_ID": "individual"
     }
 
     def __init__(self, config):
@@ -63,17 +63,17 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
         """Add column containing CellRanger compatible filename,
         i.e. in the format of
         [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz
-        Here, [Sample Name] consists of PID and Sample Type.
+        Here, [Sample Name] consists of Object_ID.
         See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input
         """
 
-        grouped = df.groupby(["PID", "Sample Type", "LANE_NO", "READ"])
+        grouped = df.groupby(["Object_ID", "LANE_NO", "READ"])
         groups = []
         for name, group in grouped:
             group = group.sort_values("FastQ Path")  # import to have consistent sorting
             group["multi_sample_idx"] = range(1, len(group)+1)
             group["CELLRANGER_FASTQ_PATH"] = group.agg(
-                "{0[PID]}_{0[Sample Type]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format,
+                "{0[Object_ID]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format,
                 axis=1,
             )
             groups.append(group)
@@ -102,7 +102,7 @@ def filter_by_wildcards(
         :return param:
         """
 
-        _identifiers = ["individual", "sample_type"]
+        _identifiers = ["individual"]
 
         if wildcards:
             filters = dict((k, getattr(wildcards, k)) for k in _identifiers)

From ded0ee90bbec4251647b9b34c7528aca0db76414 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 5 Oct 2022 11:12:14 +0200
Subject: [PATCH 03/32] WIP small addition for customization later

---
 config/config-test.yaml     | 7 ++++---
 workflow/Snakefile          | 7 ++-----
 workflow/scripts/samples.py | 4 ++--
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index 54e6b0c..328ff60 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -13,9 +13,10 @@ metadata:
   raw:
    - "~/Interspecies_BM_phd/data/metadata_full.csv"
   identifiers:
-    - Object_ID
-    - Age
-    - Fraction
+    - Species_ID
+    - Age_ID
+    - Fraction_ID
+    - Sample_NR
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
diff --git a/workflow/Snakefile b/workflow/Snakefile
index b76e796..dde2ce0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,17 +14,14 @@ def get_sample_string(identifiers):
 
 SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
 
-#rint(samples.metadata)
-#print(SAMPLE_WILDCARDS)
+print(samples.metadata)
+print(SAMPLE_WILDCARDS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
 wildcard_constraints:
     individual="[A-z0-9_.-]+"
 
-print(samples.targets)
-
-
 rule all:
     input:
         samples.targets
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 76e8f1a..ee8e5b6 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -2,7 +2,6 @@
 
 from snakemake.io import Wildcards
 
-
 class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
@@ -25,7 +24,7 @@ class Samples:
                "READ",
                "CELLRANGER_FASTQ_PATH",
                "Object_ID"]
-
+    
     # map to rename columns in the format of (old):(new)
     columns_map = {
         "Object_ID": "individual"
@@ -33,6 +32,7 @@ class Samples:
 
     def __init__(self, config):
         metadata_files = config["metadata"]["raw"]
+        identifiers = config["metadata"]["identifiers"] # for customization later
 
         self.output_base_dir = config["paths"]["output_dir"]
         self.target_templates = config["paths"]["target_templates"]

From 028af443578345b1cdbba562355d91ef410f5a38 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 5 Oct 2022 16:06:46 +0200
Subject: [PATCH 04/32] WIP reying to run rule cellranger_count

---
 config/config-test.yaml |  2 +-
 workflow/Snakefile      | 15 +++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index 328ff60..67d667f 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -5,7 +5,7 @@ paths:
     cellranger_count: "cellranger_count/{0[individual]}/outs"
     samples_sheet: "metadata/samples.csv"
 references:
-  all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_all_strains"
+  all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
   CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
   SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
   C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index dde2ce0..29b5516 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,13 +14,13 @@ def get_sample_string(identifiers):
 
 SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
 
-print(samples.metadata)
-print(SAMPLE_WILDCARDS)
+#print(samples.metadata)
+#print(SAMPLE_WILDCARDS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
-wildcard_constraints:
-    individual="[A-z0-9_.-]+"
+wildcard_constraints: #individuals should not contain . or cellranger will not accept it as ID
+    individual="[A-z0-9_-]+"
 
 rule all:
     input:
@@ -65,8 +65,6 @@ rule write_samples_spreadsheet:
             samples.metadata.to_csv(output[0], index=False)
             print("done.", file=log_fh)
 
-print("works")
-
 rule cellranger_count:
     """Run Cellranger count
     
@@ -76,10 +74,10 @@ rule cellranger_count:
     input:
         lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output]
     output:
-        outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs"
+        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs")
     params:
         genome=REFERENCES["all_masked"],
-        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1],  # remove last 2 levels from output path
+        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
         "logs/cellranger_count_{individual}.log"
@@ -88,6 +86,7 @@ rule cellranger_count:
         "cellranger/6.1.1",
     shell:
         "cd {params.output_root_dir}; "
+        "pwd;"
         "cellranger count "
         "--id {wildcards.individual} "  # letters, numbers, hyphens, and underscores
         "--transcriptome {params.genome} "

From 17e1d0d83e43c04def396a73115c7af3cde3a490 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 6 Oct 2022 13:27:58 +0200
Subject: [PATCH 05/32] path changes to try successful run

---
 config/config-test.yaml | 10 +++++-----
 workflow/Snakefile      |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index 67d667f..e42cb6e 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -6,12 +6,12 @@ paths:
     samples_sheet: "metadata/samples.csv"
 references:
   all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
-  CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
-  SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
-  C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
+  #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
+  #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
+  #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
 metadata:
   raw:
-   - "~/Interspecies_BM_phd/data/metadata_full.csv"
+   - "~/snakemake-cellranger/data/metadata_full.csv"
   identifiers:
     - Species_ID
     - Age_ID
@@ -26,7 +26,7 @@ metadata:
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
   cellranger_count:
-    extra: ""  # set additional arguments for cellranger count
+  #  extra: ""  # set additional arguments for cellranger count
   allele_specific: False
   wasp_filter_reads: False
   realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 29b5516..04f5ebe 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -58,7 +58,7 @@ rule write_samples_spreadsheet:
     output:
         f"{OUTPUT_BASE_PATH}/{config['paths']['target_templates']['samples_sheet']}"
     log:
-        "logs/write_samples_spreadsheet.log"
+        f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log"
     run:
         with open(log[0], "w") as log_fh:
             print("storing samples metadata to {output[0]}", file=log_fh)
@@ -80,7 +80,7 @@ rule cellranger_count:
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        "logs/cellranger_count_{individual}.log"
+        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{individual}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",

From 31cc8e7d291a9cde7cdd5deea3e11959e389b8af Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 6 Oct 2022 16:19:44 +0200
Subject: [PATCH 06/32] minor change for successful run with test dataset

---
 workflow/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 04f5ebe..0fb4bbf 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -42,7 +42,7 @@ rule link_files:
     output:
         directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/")
     log:
-        "logs/link_files_{individual}.log"
+        f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -80,7 +80,7 @@ rule cellranger_count:
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{individual}.log"
+        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",

From 159870053f3265bd7a5f169643af70aa0446e56a Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 6 Oct 2022 16:48:27 +0200
Subject: [PATCH 07/32] added R script for constructing SCE objects

---
 config/config-test.yaml |  1 +
 workflow/Snakefile      | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index e42cb6e..dafdf53 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -4,6 +4,7 @@ paths:
     linked_files: "linked_files/{0[individual]}"
     cellranger_count: "cellranger_count/{0[individual]}/outs"
     samples_sheet: "metadata/samples.csv"
+    construct_sce_objects: "sce_objects/{0[individual]}"
 references:
   all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
   #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0fb4bbf..7e865ec 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -95,3 +95,13 @@ rule cellranger_count:
         #"{extra} "
         "--sample {wildcards.individual} "
         "> {log}"
+        
+# construct SCE objects from counts matrices found in ../alignment/output/
+rule construct_sce_objects:
+    input: 
+        output_cellranger = 
+        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix"
+    output:
+       sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
+    script:
+        "scripts/construct_sce_objects.R"

From cf7af69b8005ff70ec4d3d86f9902efe30a49e4e Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 6 Oct 2022 16:49:26 +0200
Subject: [PATCH 08/32] added R script for SCE object construction

---
 workflow/scripts/construct_sce_objects.R | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 workflow/scripts/construct_sce_objects.R

diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
new file mode 100644
index 0000000..9684ca3
--- /dev/null
+++ b/workflow/scripts/construct_sce_objects.R
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------------
+# call objects from raw cellranger output matrices and add metadata
+
+library(DropletUtils, quietly = TRUE) 
+# r and DropletUtils modules must be installed in snakemake conda environment
+# https://anaconda.org/conda-forge/r-base
+# https://anaconda.org/bioconda/bioconductor-dropletutils
+
+#identifiers <- snakemake@params[["identifiers"]]
+#single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]]
+
+
+#print(metadata)
+#print(identifiers)
+#print(single_cell_object_metadata_fields)
+
+print(snakemake@input[["output_cellranger"]])
+
+# construct from raw cellranger output
+sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, 
+                     type = "sparse" )
+
+
+saveRDS(sce, file = snakemake@output[["sce_objects"]])

From 2fa037f51d20703d7559aeb0120164fdc3a26f35 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 6 Oct 2022 18:53:38 +0200
Subject: [PATCH 09/32] WIP trying to change 'individual' to identifier
 wildcards

---
 config/config-test.yaml                  |  3 +-
 workflow/Snakefile                       | 35 ++++++++++++++----------
 workflow/scripts/construct_sce_objects.R | 35 +++++++++++++++++++-----
 workflow/scripts/samples.py              |  3 ++
 4 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index dafdf53..2888a05 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -14,10 +14,11 @@ metadata:
   raw:
    - "~/snakemake-cellranger/data/metadata_full.csv"
   identifiers:
-    - Species_ID
+    #- Species_ID # not needed atm since I already have only one species
     - Age_ID
     - Fraction_ID
     - Sample_NR
+  #  - Object_ID
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 7e865ec..63c69f2 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -6,21 +6,24 @@ from scripts.samples import Samples
 
 samples = Samples(config)
 
-REFERENCES = config["references"]
+METADATA_PATH = config["metadata"]["raw"]
 OUTPUT_BASE_PATH = config["paths"]["output_dir"]
 
+REFERENCES = config["references"]
+IDENTIFIERS = config["metadata"]["identifiers"]
+
 def get_sample_string(identifiers):
     return "_".join(["{" + label + "}" for label in identifiers])
-
-SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
-
-#print(samples.metadata)
-#print(SAMPLE_WILDCARDS)
+SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
-wildcard_constraints: #individuals should not contain . or cellranger will not accept it as ID
-    individual="[A-z0-9_-]+"
+wildcard_constraints: #individuals should not contain "." 
+    #individual="[A-z0-9_-]+",
+    #Species_ID="[A-z0-9_-]+",
+    Age_ID=IDENTIFIERS[0],
+    Fraction_ID="[A-z0-9_-]+",
+    Sample_NR="[A-z0-9-]+"
 
 rule all:
     input:
@@ -40,9 +43,8 @@ rule link_files:
 
         )
     output:
-        directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/")
     log:
-        f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
+        f"{OUTPUT_BASE_PATH}/logs/link_files_{SAMPLE_WILDCARDS}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -74,13 +76,13 @@ rule cellranger_count:
     input:
         lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output]
     output:
-        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs")
+        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs")
     params:
         genome=REFERENCES["all_masked"],
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log"
+        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{SAMPLE_WILDCARDS}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",
@@ -95,13 +97,16 @@ rule cellranger_count:
         #"{extra} "
         "--sample {wildcards.individual} "
         "> {log}"
-        
+
 # construct SCE objects from counts matrices found in ../alignment/output/
 rule construct_sce_objects:
     input: 
         output_cellranger = 
-        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix"
+        f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs/raw_feature_bc_matrix"
     output:
-       sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{SAMPLE_WILDCARDS}"
+    params:
+        identifiers = IDENTIFIERS,
+        metadata = METADATA_PATH
     script:
         "scripts/construct_sce_objects.R"
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 9684ca3..0e36696 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -9,16 +9,37 @@ library(DropletUtils, quietly = TRUE)
 #identifiers <- snakemake@params[["identifiers"]]
 #single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]]
 
-
-#print(metadata)
-#print(identifiers)
-#print(single_cell_object_metadata_fields)
-
-print(snakemake@input[["output_cellranger"]])
-
 # construct from raw cellranger output
 sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, 
                      type = "sparse" )
 
+# add specified metadata to object 
+metadata <- read.csv(file = snakemake@params[["metadata"]], 
+                     head = TRUE, 
+                     sep = ",", 
+                     check.names=FALSE, 
+                     stringsAsFactors=FALSE, 
+                     as.is=TRUE, 
+                     colClasses = "character")
+
+identifiers <- snakemake@params[["identifiers"]]
+print(identifiers)
+
+#wildcards_curr <- snakemake@wildcards[[identifiers[1]]]
+#print(wildcards_curr)
+
+# construct a unique Object_ID from the identifiers and add to metadata if necessary
+for(i in snakemake@wildcards){
+  print(i)
+}
+print(class(c(snakemake@wildcards)))
+
+#object_id_curr <- paste(identifiers, collapse = "_")
+#print(object_id_curr)
+
+
+
 
 saveRDS(sce, file = snakemake@output[["sce_objects"]])
+
+
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index ee8e5b6..87d628c 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -1,5 +1,7 @@
 import pandas as pd
 
+#config['metadata']['identifiers']
+
 from snakemake.io import Wildcards
 
 class Samples:
@@ -29,6 +31,7 @@ class Samples:
     columns_map = {
         "Object_ID": "individual"
     }
+    
 
     def __init__(self, config):
         metadata_files = config["metadata"]["raw"]

From 224a296687342336f0ecac50e3c99ca5c935aa71 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Fri, 7 Oct 2022 10:04:52 +0200
Subject: [PATCH 10/32] WIP going back to individual

---
 config/config-test.yaml                  |  8 ++++----
 workflow/Snakefile                       | 17 +++++++++--------
 workflow/scripts/construct_sce_objects.R | 19 ++++++++++---------
 workflow/scripts/samples.py              |  5 +++--
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/config/config-test.yaml b/config/config-test.yaml
index 2888a05..d5dfb1f 100644
--- a/config/config-test.yaml
+++ b/config/config-test.yaml
@@ -15,10 +15,10 @@ metadata:
    - "~/snakemake-cellranger/data/metadata_full.csv"
   identifiers:
     #- Species_ID # not needed atm since I already have only one species
-    - Age_ID
-    - Fraction_ID
-    - Sample_NR
-  #  - Object_ID
+    #- Age_ID
+    #- Fraction_ID
+    #- Sample_NR
+     - Object_ID
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 63c69f2..38dcc49 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -21,9 +21,10 @@ localrules: link_files, write_samples_spreadsheet, all  # local execution of non
 wildcard_constraints: #individuals should not contain "." 
     #individual="[A-z0-9_-]+",
     #Species_ID="[A-z0-9_-]+",
-    Age_ID=IDENTIFIERS[0],
-    Fraction_ID="[A-z0-9_-]+",
-    Sample_NR="[A-z0-9-]+"
+    #Age_ID="[A-z0-9_-]+",
+    #Fraction_ID="[A-z0-9_-]+",
+    #Sample_NR="[A-z0-9-]+"
+    individual="[A-z0-9-]+"
 
 rule all:
     input:
@@ -44,7 +45,7 @@ rule link_files:
         )
     output:
     log:
-        f"{OUTPUT_BASE_PATH}/logs/link_files_{SAMPLE_WILDCARDS}.log"
+        f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -76,13 +77,13 @@ rule cellranger_count:
     input:
         lambda wildcards: [os.path.abspath(path) for path in rules.link_files.output]
     output:
-        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs")
+        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs")
     params:
         genome=REFERENCES["all_masked"],
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{SAMPLE_WILDCARDS}.log"
+        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",
@@ -102,9 +103,9 @@ rule cellranger_count:
 rule construct_sce_objects:
     input: 
         output_cellranger = 
-        f"{OUTPUT_BASE_PATH}/cellranger_count/{SAMPLE_WILDCARDS}/outs/raw_feature_bc_matrix"
+        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix"
     output:
-        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{SAMPLE_WILDCARDS}"
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
     params:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 0e36696..3a94b89 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -25,17 +25,11 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
 identifiers <- snakemake@params[["identifiers"]]
 print(identifiers)
 
-#wildcards_curr <- snakemake@wildcards[[identifiers[1]]]
-#print(wildcards_curr)
+wildcards_curr <- snakemake@wildcards[["individual"]]
+print(wildcards_curr)
+
 
-# construct a unique Object_ID from the identifiers and add to metadata if necessary
-for(i in snakemake@wildcards){
-  print(i)
-}
-print(class(c(snakemake@wildcards)))
 
-#object_id_curr <- paste(identifiers, collapse = "_")
-#print(object_id_curr)
 
 
 
@@ -43,3 +37,10 @@ print(class(c(snakemake@wildcards)))
 saveRDS(sce, file = snakemake@output[["sce_objects"]])
 
 
+
+
+metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE,  sep = ",",     check.names=FALSE,   stringsAsFactors=FALSE,  as.is=TRUE, colClasses = "character")
+sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE,   type = "sparse" )
+
+
+
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 87d628c..3889625 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -25,11 +25,11 @@ class Samples:
                "LANE_NO",
                "READ",
                "CELLRANGER_FASTQ_PATH",
-               "Object_ID"]
+               "Object_ID"] 
     
     # map to rename columns in the format of (old):(new)
     columns_map = {
-        "Object_ID": "individual"
+        "Object_ID": "individual" # Object_ID should uniquely identify each sample/object
     }
     
 
@@ -48,6 +48,7 @@ def __init__(self, config):
 
         self.metadata = self.select_columns(metadata_full)
         self.metadata = self.metadata.rename(self.columns_map, axis="columns")
+        
 
     def rename_date_of_birth(self,
                              row: pd.Series):

From 135c3c1b2388e73b29c4528bbfe46519712d0d02 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Fri, 7 Oct 2022 10:41:26 +0200
Subject: [PATCH 11/32] addition of metadata to SCE objects during construction

---
 config/config-test.yaml                  | 34 ---------------------
 workflow/Snakefile                       | 20 +++++--------
 workflow/scripts/construct_sce_objects.R | 38 ++++++++++--------------
 workflow/scripts/samples.py              |  6 ++--
 4 files changed, 28 insertions(+), 70 deletions(-)
 delete mode 100644 config/config-test.yaml

diff --git a/config/config-test.yaml b/config/config-test.yaml
deleted file mode 100644
index d5dfb1f..0000000
--- a/config/config-test.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-paths:
-  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
-  target_templates:
-    linked_files: "linked_files/{0[individual]}"
-    cellranger_count: "cellranger_count/{0[individual]}/outs"
-    samples_sheet: "metadata/samples.csv"
-    construct_sce_objects: "sce_objects/{0[individual]}"
-references:
-  all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
-  #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
-  #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
-  #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
-metadata:
-  raw:
-   - "~/snakemake-cellranger/data/metadata_full.csv"
-  identifiers:
-    #- Species_ID # not needed atm since I already have only one species
-    #- Age_ID
-    #- Fraction_ID
-    #- Sample_NR
-     - Object_ID
-  # Define all columns from the metadata spreadsheet that
-  # will be included in the SingleCellExperiment / Seurat
-  # objects
-  single_cell_object_metadata_fields:
-      - SAMPLE_NAME
-      - Age
-# Enable / Disable rules and specifiy rule-specific parameters
-rules:
-  cellranger_count:
-  #  extra: ""  # set additional arguments for cellranger count
-  allele_specific: False
-  wasp_filter_reads: False
-  realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 38dcc49..8d8a8ff 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -10,20 +10,15 @@ METADATA_PATH = config["metadata"]["raw"]
 OUTPUT_BASE_PATH = config["paths"]["output_dir"]
 
 REFERENCES = config["references"]
-IDENTIFIERS = config["metadata"]["identifiers"]
+IDENTIFIER = config["metadata"]["identifier"]
 
-def get_sample_string(identifiers):
-    return "_".join(["{" + label + "}" for label in identifiers])
-SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS)
+#def get_sample_string(identifiers):
+#    return "_".join(["{" + label + "}" for label in identifiers])
+#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
-wildcard_constraints: #individuals should not contain "." 
-    #individual="[A-z0-9_-]+",
-    #Species_ID="[A-z0-9_-]+",
-    #Age_ID="[A-z0-9_-]+",
-    #Fraction_ID="[A-z0-9_-]+",
-    #Sample_NR="[A-z0-9-]+"
+wildcard_constraints: #should not contain ".", see rule cellranger_count
     individual="[A-z0-9-]+"
 
 rule all:
@@ -107,7 +102,8 @@ rule construct_sce_objects:
     output:
         sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
     params:
-        identifiers = IDENTIFIERS,
-        metadata = METADATA_PATH
+        identifier = IDENTIFIER,
+        metadata = METADATA_PATH,
+        single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
     script:
         "scripts/construct_sce_objects.R"
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 3a94b89..86af012 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -6,11 +6,9 @@ library(DropletUtils, quietly = TRUE)
 # https://anaconda.org/conda-forge/r-base
 # https://anaconda.org/bioconda/bioconductor-dropletutils
 
-#identifiers <- snakemake@params[["identifiers"]]
-#single_cell_object_metadata_fields <- snakemake@params[["single_cell_object_metadata_fields"]]
-
-# construct from raw cellranger output
-sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], col.names = TRUE, 
+# construct SCE object from raw cellranger output
+sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], 
+                     col.names = TRUE, 
                      type = "sparse" )
 
 # add specified metadata to object 
@@ -22,25 +20,21 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-identifiers <- snakemake@params[["identifiers"]]
-print(identifiers)
-
-wildcards_curr <- snakemake@wildcards[["individual"]]
-print(wildcards_curr)
-
-
-
-
+wildcards_curr <- snakemake@wildcards[["individual"]] # to know which sample is currently loaded
+identifier <- snakemake@params[["identifier"]] # to know which metadata column to compare it to to get the correct info
 
+# subset data that is relevant for current object only, as specified by wildcard and by by single_cell_object_metadata_fields
+metadata_curr <- metadata[which(metadata[which(colnames(metadata) == identifier)] == wildcards_curr),]
+cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
+metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
 
+# add cell-level metadata to each barcode in the SCE object
+for(i in colnames(metadata_curr)){
+  colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce))
+}
 
 saveRDS(sce, file = snakemake@output[["sce_objects"]])
 
-
-
-
-metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE,  sep = ",",     check.names=FALSE,   stringsAsFactors=FALSE,  as.is=TRUE, colClasses = "character")
-sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE,   type = "sparse" )
-
-
-
+# for testing purposes
+#metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE,  sep = ",",     check.names=FALSE,   stringsAsFactors=FALSE,  as.is=TRUE, colClasses = "character")
+#sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE,   type = "sparse" )
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 3889625..4b7a419 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -28,14 +28,16 @@ class Samples:
                "Object_ID"] 
     
     # map to rename columns in the format of (old):(new)
+    # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet 
+    # it must be identical to config metadata identifier
+    # I haven't found a way to directly lift it from config yet
     columns_map = {
-        "Object_ID": "individual" # Object_ID should uniquely identify each sample/object
+        "Object_ID": "individual" 
     }
     
 
     def __init__(self, config):
         metadata_files = config["metadata"]["raw"]
-        identifiers = config["metadata"]["identifiers"] # for customization later
 
         self.output_base_dir = config["paths"]["output_dir"]
         self.target_templates = config["paths"]["target_templates"]

From e9691333807dad9b3cd15517f58947177b007f11 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Fri, 7 Oct 2022 13:53:23 +0200
Subject: [PATCH 12/32] minor fixes

---
 workflow/scripts/construct_sce_objects.R | 19 +++++++++----------
 workflow/scripts/samples.py              |  2 --
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 86af012..45795b8 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -1,5 +1,4 @@
 #-------------------------------------------------------------------------------
-# call objects from raw cellranger output matrices and add metadata
 
 library(DropletUtils, quietly = TRUE) 
 # r and DropletUtils modules must be installed in snakemake conda environment
@@ -20,11 +19,15 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-wildcards_curr <- snakemake@wildcards[["individual"]] # to know which sample is currently loaded
-identifier <- snakemake@params[["identifier"]] # to know which metadata column to compare it to to get the correct info
+# to know which sample is currently loaded
+wildcard_curr <- snakemake@wildcards[["individual"]] 
+# to know which metadata column to compare it to 
+identifier <- snakemake@params[["identifier"]] 
 
-# subset data that is relevant for current object only, as specified by wildcard and by by single_cell_object_metadata_fields
-metadata_curr <- metadata[which(metadata[which(colnames(metadata) == identifier)] == wildcards_curr),]
+# subset data that is relevant for current object only
+# as specified by wildcard and by by single_cell_object_metadata_fields
+metadata_curr <- metadata[
+  which(metadata[which(colnames(metadata) == identifier)] == wildcard_curr),]
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
 metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
 
@@ -33,8 +36,4 @@ for(i in colnames(metadata_curr)){
   colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce))
 }
 
-saveRDS(sce, file = snakemake@output[["sce_objects"]])
-
-# for testing purposes
-#metadata <- read.csv(file = "~/snakemake-cellranger/data/metadata_full.csv", head = TRUE,  sep = ",",     check.names=FALSE,   stringsAsFactors=FALSE,  as.is=TRUE, colClasses = "character")
-#sce <- read10xCounts(samples = "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/cellranger_count/mmus_old_str_2_0/outs/raw_feature_bc_matrix", col.names = TRUE,   type = "sparse" )
+saveRDS(sce, file = snakemake@output[["sce_objects"]])
\ No newline at end of file
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 4b7a419..4073787 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -1,7 +1,5 @@
 import pandas as pd
 
-#config['metadata']['identifiers']
-
 from snakemake.io import Wildcards
 
 class Samples:

From 9276f95bfff6393b13519a3ab4012f0e6377cfbc Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 10 Oct 2022 10:33:38 +0200
Subject: [PATCH 13/32] restored modified versions after merge

---
 config/config-test-lea.yaml | 48 ++++++++++++++++++++++++++++++++++
 workflow/Snakefile          | 51 +++++++++++++++++++++++++------------
 2 files changed, 83 insertions(+), 16 deletions(-)
 create mode 100644 config/config-test-lea.yaml

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
new file mode 100644
index 0000000..abecb23
--- /dev/null
+++ b/config/config-test-lea.yaml
@@ -0,0 +1,48 @@
+paths:
+  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
+  target_templates:
+    linked_files: "linked_files/{0[individual]}"
+    cellranger_count: "cellranger_count/{0[individual]}/outs"
+    samples_sheet: "metadata/samples.csv"
+    construct_sce_objects: "sce_objects/{0[individual]}"
+references:
+  all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
+  #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
+  #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
+  #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
+metadata:
+  raw:
+   - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv"
+  identifier:
+    #- Species_ID # not needed atm since I already have only one species
+    #- Age_ID
+    #- Fraction_ID 
+    #- Sample_NR
+     - Object_ID
+  # Define all columns from the metadata spreadsheet that
+  # will be included in the SingleCellExperiment / Seurat
+  # objects
+  single_cell_object_metadata_fields:
+      - Object_ID
+      - Mouse_ID
+      - Species_ID
+      - Age
+      - Age_ID
+      - Age_weeks
+      - Fraction
+      - Fraction_ID
+      - Antibody_combination
+      - Sample_NR
+      - Extrarun
+      - Batch_exp_day
+      - Batch_sequencing
+      - Date_collected
+      - Discard
+      - PID
+# Enable / Disable rules and specifiy rule-specific parameters
+rules:
+  cellranger_count:
+  #  extra: ""  # set additional arguments for cellranger count
+  allele_specific: False
+  wasp_filter_reads: False
+  realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index b035d0c..9f50f37 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -6,27 +6,27 @@ from scripts.samples import Samples
 
 samples = Samples(config)
 
-REFERENCES = config["references"]
+METADATA_PATH = config["metadata"]["raw"]
 OUTPUT_BASE_PATH = config["paths"]["output_dir"]
 
-def get_sample_string(identifiers):
-    return "_".join(["{" + label + "}" for label in identifiers])
+REFERENCES = config["references"]
+IDENTIFIER = config["metadata"]["identifier"]
 
-SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers'])
+#def get_sample_string(identifiers):
+#    return "_".join(["{" + label + "}" for label in identifiers])
+#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS)
 
 print(samples.metadata)
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
-wildcard_constraints:
-    sample_type="[A-z0-9_-]+",
-    individual="[A-z0-9_-]+"
+wildcard_constraints: #should not contain ".", see rule cellranger_count
+    individual="[A-z0-9-]+"
 
 rule all:
     input:
         samples.targets
 
-
 rule link_files:
     """Link files with filename pattern required for Cellranger
     
@@ -41,9 +41,8 @@ rule link_files:
 
         )
     output:
-        directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/{{sample_type}}/")
     log:
-        "logs/link_files_{individual}_{sample_type}.log"
+        f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -59,7 +58,7 @@ rule write_samples_spreadsheet:
     output:
         f"{OUTPUT_BASE_PATH}/{config['paths']['target_files']['samples_sheet']}"
     log:
-        "logs/write_samples_spreadsheet.log"
+        f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log"
     run:
         with open(log[0], "w") as log_fh:
             print("storing samples metadata to {output[0]}", file=log_fh)
@@ -75,23 +74,43 @@ rule cellranger_count:
     input:
         rules.link_files.output
     output:
-        outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}_{{sample_type}}/outs"
+        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs")
     params:
         genome=REFERENCES["all_masked"],
-        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1],  # remove last 2 levels from output path
-        extra=config["rules"]["cellranger_count"]["extra"]
+        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
+        #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        "logs/cellranger_count_{individual}_{sample_type}.log"
+        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",
     shell:
         "cd {params.output_root_dir}; "
+        "pwd;"
         "cellranger count "
-        "--id {wildcards.individual}_{wildcards.sample_type} "  # letters, numbers, hyphens, and underscores
+        "--id {wildcards.individual} "  # letters, numbers, hyphens, and underscores
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
+<<<<<<< HEAD
+        #"{extra} "
+        "--sample {wildcards.individual} "
+=======
         "{params.extra} "
         "--sample {wildcards.individual}_{wildcards.sample_type} "
+>>>>>>> cf05a99b38eb3233b3533b0392a99aa750ce3fa0
         "> {log}"
+
+# construct SCE objects from counts matrices found in ../alignment/output/
+rule construct_sce_objects:
+    input: 
+        output_cellranger = 
+        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix"
+    output:
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
+    params:
+        identifier = IDENTIFIER,
+        metadata = METADATA_PATH,
+        single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
+    script:
+        "scripts/construct_sce_objects.R"

From a2895324875afd31e1b48343a752d35c804bf58e Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 10 Oct 2022 11:26:51 +0200
Subject: [PATCH 14/32] minor fixes

---
 config/config-test-lea.yaml | 3 +++
 workflow/Snakefile          | 8 +++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index abecb23..60401e9 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -5,6 +5,9 @@ paths:
     cellranger_count: "cellranger_count/{0[individual]}/outs"
     samples_sheet: "metadata/samples.csv"
     construct_sce_objects: "sce_objects/{0[individual]}"
+  # define specific target files
+  target_files:
+    samples_sheet: "metadata.csv"
 references:
   all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
   #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 9f50f37..0af6b5e 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -41,6 +41,7 @@ rule link_files:
 
         )
     output:
+       directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}")
     log:
         f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
     run:
@@ -92,13 +93,10 @@ rule cellranger_count:
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
-<<<<<<< HEAD
         #"{extra} "
         "--sample {wildcards.individual} "
-=======
-        "{params.extra} "
-        "--sample {wildcards.individual}_{wildcards.sample_type} "
->>>>>>> cf05a99b38eb3233b3533b0392a99aa750ce3fa0
+        #"{params.extra} "
+        "--sample {wildcards.individual} "
         "> {log}"
 
 # construct SCE objects from counts matrices found in ../alignment/output/

From 0b12e341b11c19f49c8c223c7855050f990d7b0d Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 10 Oct 2022 11:48:09 +0200
Subject: [PATCH 15/32] minor adjustments

---
 workflow/Snakefile          | 4 +---
 workflow/scripts/samples.py | 6 ++++++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0af6b5e..3a1b251 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -87,14 +87,12 @@ rule cellranger_count:
         "cellranger/6.1.1",
     shell:
         "cd {params.output_root_dir}; "
-        "pwd;"
+        "pwd; "
         "cellranger count "
         "--id {wildcards.individual} "  # letters, numbers, hyphens, and underscores
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
-        #"{extra} "
-        "--sample {wildcards.individual} "
         #"{params.extra} "
         "--sample {wildcards.individual} "
         "> {log}"
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 4073787..c2f3a90 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -67,7 +67,9 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
         """Add column containing CellRanger compatible filename,
         i.e. in the format of
         [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz
+        
         Here, [Sample Name] consists of Object_ID.
+        
         See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input
         """
 
@@ -136,3 +138,7 @@ def targets(self) -> list:
 
         targets = list(map(lambda x: self.metadata.agg(x.format, axis=1).drop_duplicates().to_list(), target_templates.values()))
         return targets
+      
+      
+      
+#grouped = metadata_subset.groupby(["PID", "Sample Type"])["FastQ Path"].apply(list)

From 9a869c76123a62cb819ec495e8ac3161e95b9894 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Tue, 11 Oct 2022 12:03:09 +0200
Subject: [PATCH 16/32] SCe construction now functional with metadata_full

---
 config/config-test-lea.yaml              | 3 +--
 workflow/Snakefile                       | 2 +-
 workflow/scripts/construct_sce_objects.R | 4 +++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index 60401e9..a929214 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,9 +1,8 @@
 paths:
-  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
+  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test5"
   target_templates:
     linked_files: "linked_files/{0[individual]}"
     cellranger_count: "cellranger_count/{0[individual]}/outs"
-    samples_sheet: "metadata/samples.csv"
     construct_sce_objects: "sce_objects/{0[individual]}"
   # define specific target files
   target_files:
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3a1b251..103dc7b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -101,7 +101,7 @@ rule cellranger_count:
 rule construct_sce_objects:
     input: 
         output_cellranger = 
-        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/raw_feature_bc_matrix"
+        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/"
     output:
         sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
     params:
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 45795b8..578265f 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -6,7 +6,9 @@ library(DropletUtils, quietly = TRUE)
 # https://anaconda.org/bioconda/bioconductor-dropletutils
 
 # construct SCE object from raw cellranger output
-sce <- read10xCounts(samples = snakemake@input[["output_cellranger"]], 
+print(paste0(snakemake@input[["output_cellranger"]], "raw_feature_bc_matrix"))
+
+sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"), 
                      col.names = TRUE, 
                      type = "sparse" )
 

From cf8fb83523faaaf477fa73b806012a9af0bdbbfe Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 12 Oct 2022 00:56:25 +0200
Subject: [PATCH 17/32] concatenate multiple identifiers

---
 config/config-test-lea.yaml              | 20 ++++++-----
 workflow/Snakefile                       | 18 ++++------
 workflow/scripts/construct_sce_objects.R | 31 ++++++++++-------
 workflow/scripts/samples.py              | 42 ++++++++++++++++++------
 4 files changed, 70 insertions(+), 41 deletions(-)

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index a929214..d9718ef 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,5 +1,5 @@
 paths:
-  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test5"
+  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test10"
   target_templates:
     linked_files: "linked_files/{0[individual]}"
     cellranger_count: "cellranger_count/{0[individual]}/outs"
@@ -15,16 +15,20 @@ references:
 metadata:
   raw:
    - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv"
-  identifier:
-    #- Species_ID # not needed atm since I already have only one species
-    #- Age_ID
-    #- Fraction_ID 
-    #- Sample_NR
-     - Object_ID
+  identifiers: 
+  # Define column name(s) used to define a sample (in correct order)
+  # entries from multiple cols are concatenated into one sample-specific name
+  # in the format of {Species_ID}_{Age_ID}_{Fraction_ID}_{Sample_NR}
+    - Species_ID # col names I manually added to metadata sheet beforehand
+    - Age_ID
+    - Fraction_ID 
+    - Sample_NR
+    #- Objects_ID 
+    #- PID # col name automatically included in OTP metadata sheets
+  single_cell_object_metadata_fields:
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
-  single_cell_object_metadata_fields:
       - Object_ID
       - Mouse_ID
       - Species_ID
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 103dc7b..b7faef1 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -10,18 +10,15 @@ METADATA_PATH = config["metadata"]["raw"]
 OUTPUT_BASE_PATH = config["paths"]["output_dir"]
 
 REFERENCES = config["references"]
-IDENTIFIER = config["metadata"]["identifier"]
-
-#def get_sample_string(identifiers):
-#    return "_".join(["{" + label + "}" for label in identifiers])
-#SAMPLE_WILDCARDS = get_sample_string(IDENTIFIERS)
+IDENTIFIERS = config["metadata"]["identifiers"]
 
 print(samples.metadata)
+print(samples.metadata["individual"].unique())
 
-localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
+localrules: link_files, write_samples_spreadsheet, all, construct_sce_objects  # local execution of non-demanding rules
 
 wildcard_constraints: #should not contain ".", see rule cellranger_count
-    individual="[A-z0-9-]+"
+    individual="[A-z0-9_-]+" 
 
 rule all:
     input:
@@ -100,13 +97,12 @@ rule cellranger_count:
 # construct SCE objects from counts matrices found in ../alignment/output/
 rule construct_sce_objects:
     input: 
-        output_cellranger = 
-        f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs/"
+        output_cellranger = rules.cellranger_count.output
     output:
         sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
     params:
-        identifier = IDENTIFIER,
-        metadata = METADATA_PATH,
+        identifiers = IDENTIFIERS,
+        metadata = METADATA_PATH, # could be replaced with samples.metadata if R script is changed as well
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
     script:
         "scripts/construct_sce_objects.R"
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 578265f..7ba72c1 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -1,14 +1,16 @@
 #-------------------------------------------------------------------------------
 
 library(DropletUtils, quietly = TRUE) 
-# r and DropletUtils modules must be installed in snakemake conda environment
+library(tidyverse, quietly = TRUE) 
+# r and library modules must be installed in snakemake conda environment
 # https://anaconda.org/conda-forge/r-base
 # https://anaconda.org/bioconda/bioconductor-dropletutils
+# https://anaconda.org/r/r-tidyverse
 
 # construct SCE object from raw cellranger output
-print(paste0(snakemake@input[["output_cellranger"]], "raw_feature_bc_matrix"))
-
-sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"), 
+print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"))
+sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], 
+                                      "/raw_feature_bc_matrix"), 
                      col.names = TRUE, 
                      type = "sparse" )
 
@@ -21,15 +23,20 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-# to know which sample is currently loaded
-wildcard_curr <- snakemake@wildcards[["individual"]] 
-# to know which metadata column to compare it to 
-identifier <- snakemake@params[["identifier"]] 
+wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded sample 
+IDENTIFIERS <- snakemake@params[["identifiers"]] 
+
+# if necessary concatenate identifiers again to obtain all possible wildcards 
+metadata_curr <- metadata
+if(! "individual" %in% colnames(metadata_curr)){
+  metadata_curr <- metadata_curr %>%
+    unite(col = "individual", all_of(IDENTIFIERS), sep = "_", remove = FALSE)
+  print('establishing "individual" column, construct_sce_objects.R')
+}
 
-# subset data that is relevant for current object only
-# as specified by wildcard and by by single_cell_object_metadata_fields
-metadata_curr <- metadata[
-  which(metadata[which(colnames(metadata) == identifier)] == wildcard_curr),]
+# subset data as specified by wildcard and single_cell_object_metadata_fields
+metadata_curr <- metadata_curr[
+  which(metadata_curr$individual == wildcard_curr),]
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
 metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
 
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index c2f3a90..482a981 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -2,6 +2,9 @@
 
 from snakemake.io import Wildcards
 
+# list identifiers identically to config identifiers and in same order
+IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID", "Sample_NR"] # find a way to lift directly from config
+
 class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
@@ -23,15 +26,16 @@ class Samples:
                "LANE_NO",
                "READ",
                "CELLRANGER_FASTQ_PATH",
-               "Object_ID"] 
+               "individual"] # include individual which is later generated
     
+    columns = columns + IDENTIFIERS
     # map to rename columns in the format of (old):(new)
     # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet 
     # it must be identical to config metadata identifier
     # I haven't found a way to directly lift it from config yet
-    columns_map = {
-        "Object_ID": "individual" 
-    }
+    #columns_map = {
+    #    "Object_ID": "individual" 
+    #}
     
 
     def __init__(self, config):
@@ -42,13 +46,31 @@ def __init__(self, config):
 
         metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True)
 
+        """Make "individual" column
+        
+        A single identifier col is effectively renamed to "individual" but 
+        entries from multiple identifier cols are concatenated to "individual".
+        
+        Entries in the "individual" col are used as wildcards.
+        
+        For metadata_full it is used for downstream functions (?).
+        
+        This may cause a warning that I still have to take care of but that doesn't seem critical.
+        """       
+        if not "individual" in metadata_full.columns: 
+            print('establishing "individual" column') 
+            for i in IDENTIFIERS:
+                if i == IDENTIFIERS[0]:
+                    metadata_full["individual"] = ""
+                    metadata_full["individual"] = metadata_full["individual"] + metadata_full[i].map(str)
+                else: 
+                    metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str)
+      
         metadata_full["DATE_OF_BIRTH"] = metadata_full.apply(self.rename_date_of_birth, axis=1)
-
+        
         metadata_full = self.get_cellranger_filename(metadata_full)
 
         self.metadata = self.select_columns(metadata_full)
-        self.metadata = self.metadata.rename(self.columns_map, axis="columns")
-        
 
     def rename_date_of_birth(self,
                              row: pd.Series):
@@ -68,18 +90,18 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
         i.e. in the format of
         [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz
         
-        Here, [Sample Name] consists of Object_ID.
+        Here, [Sample Name] consists of "individual".
         
         See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input
         """
 
-        grouped = df.groupby(["Object_ID", "LANE_NO", "READ"])
+        grouped = df.groupby(["individual", "LANE_NO", "READ"])
         groups = []
         for name, group in grouped:
             group = group.sort_values("FastQ Path")  # import to have consistent sorting
             group["multi_sample_idx"] = range(1, len(group)+1)
             group["CELLRANGER_FASTQ_PATH"] = group.agg(
-                "{0[Object_ID]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format,
+                "{0[individual]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format,
                 axis=1,
             )
             groups.append(group)

From f415304c377cc877da4b58f055cb7b4c50d0c91f Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 12 Oct 2022 13:09:49 +0200
Subject: [PATCH 18/32] minor changes for functionality

---
 config/config-example.yaml               | 9 ++++-----
 config/config-test-lea.yaml              | 6 ++++--
 workflow/Snakefile                       | 2 +-
 workflow/scripts/construct_sce_objects.R | 5 ++---
 workflow/scripts/samples.py              | 5 ++---
 5 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/config/config-example.yaml b/config/config-example.yaml
index 2040a43..0e12dce 100644
--- a/config/config-example.yaml
+++ b/config/config-example.yaml
@@ -22,16 +22,15 @@ metadata:
     - ""
   # Define the column names used to define a sample
   identifiers:
-    - SAMPLE_NAME
+    - PID
     - Sample_Type
-    - Age
-    - fraction
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
   single_cell_object_metadata_fields:
-      - SAMPLE_NAME
-      - Age
+      - PID
+      - Sample_Type
+      - individual
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
   cellranger_count:
diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index d9718ef..327b7db 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,5 +1,5 @@
 paths:
-  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/test10"
+  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
   target_templates:
     linked_files: "linked_files/{0[individual]}"
     cellranger_count: "cellranger_count/{0[individual]}/outs"
@@ -25,11 +25,12 @@ metadata:
     - Sample_NR
     #- Objects_ID 
     #- PID # col name automatically included in OTP metadata sheets
+    #- Sample Type # col name automatically included in OTP metadata sheets
   single_cell_object_metadata_fields:
   # Define all columns from the metadata spreadsheet that
   # will be included in the SingleCellExperiment / Seurat
   # objects
-      - Object_ID
+      - Object_ID # colnames I manually added to metadata 
       - Mouse_ID
       - Species_ID
       - Age
@@ -44,6 +45,7 @@ metadata:
       - Batch_sequencing
       - Date_collected
       - Discard
+      - individual
       - PID
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
diff --git a/workflow/Snakefile b/workflow/Snakefile
index b7faef1..d3d99cb 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -102,7 +102,7 @@ rule construct_sce_objects:
         sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
     params:
         identifiers = IDENTIFIERS,
-        metadata = METADATA_PATH, # could be replaced with samples.metadata if R script is changed as well
+        metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
     script:
         "scripts/construct_sce_objects.R"
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 7ba72c1..e2625aa 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -23,7 +23,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded sample 
+wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
 IDENTIFIERS <- snakemake@params[["identifiers"]] 
 
 # if necessary concatenate identifiers again to obtain all possible wildcards 
@@ -35,8 +35,7 @@ if(! "individual" %in% colnames(metadata_curr)){
 }
 
 # subset data as specified by wildcard and single_cell_object_metadata_fields
-metadata_curr <- metadata_curr[
-  which(metadata_curr$individual == wildcard_curr),]
+metadata_curr <- metadata_curr[which(metadata_curr$individual == wildcard_curr),]
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
 metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
 
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 482a981..511668e 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -48,7 +48,7 @@ def __init__(self, config):
 
         """Make "individual" column
         
-        A single identifier col is effectively renamed to "individual" but 
+        A single identifier col is copied and renamed to "individual" but 
         entries from multiple identifier cols are concatenated to "individual".
         
         Entries in the "individual" col are used as wildcards.
@@ -61,8 +61,7 @@ def __init__(self, config):
             print('establishing "individual" column') 
             for i in IDENTIFIERS:
                 if i == IDENTIFIERS[0]:
-                    metadata_full["individual"] = ""
-                    metadata_full["individual"] = metadata_full["individual"] + metadata_full[i].map(str)
+                    metadata_full["individual"] = metadata_full[i].map(str)
                 else: 
                     metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str)
       

From 1489930b1306cdbc82ff9f4da4fee314fd5bc673 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Fri, 21 Oct 2022 17:48:45 +0200
Subject: [PATCH 19/32] changed subfolderstructure, added second wildcard again

---
 config/config-test-lea.yaml | 12 ++++++------
 workflow/Snakefile          | 21 ++++++++++++---------
 workflow/scripts/samples.py |  2 +-
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index 327b7db..9c66a52 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,9 +1,9 @@
 paths:
-  output_dir: "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test"
   target_templates:
-    linked_files: "linked_files/{0[individual]}"
-    cellranger_count: "cellranger_count/{0[individual]}/outs"
-    construct_sce_objects: "sce_objects/{0[individual]}"
+    linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}"
+    cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs"
+    construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
   # define specific target files
   target_files:
     samples_sheet: "metadata.csv"
@@ -14,7 +14,7 @@ references:
   #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
 metadata:
   raw:
-   - "/omics/groups/OE0538/internal/users/l012t/snakemake-cellranger/data/metadata_full.csv"
+   - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv"
   identifiers: 
   # Define column name(s) used to define a sample (in correct order)
   # entries from multiple cols are concatenated into one sample-specific name
@@ -44,7 +44,7 @@ metadata:
       - Batch_exp_day
       - Batch_sequencing
       - Date_collected
-      - Discard
+      - Keep_sample
       - individual
       - PID
 # Enable / Disable rules and specifiy rule-specific parameters
diff --git a/workflow/Snakefile b/workflow/Snakefile
index d3d99cb..ae4d191 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -15,10 +15,11 @@ IDENTIFIERS = config["metadata"]["identifiers"]
 print(samples.metadata)
 print(samples.metadata["individual"].unique())
 
-localrules: link_files, write_samples_spreadsheet, all, construct_sce_objects  # local execution of non-demanding rules
+localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
 wildcard_constraints: #should not contain ".", see rule cellranger_count
-    individual="[A-z0-9_-]+" 
+    individual="[A-z0-9_-]+",
+    Species_ID="[A-z0-9_-]+"
 
 rule all:
     input:
@@ -38,9 +39,9 @@ rule link_files:
 
         )
     output:
-       directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}")
+       directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}")
     log:
-        f"{OUTPUT_BASE_PATH}/logs/link_files_{{individual}}.log"
+        f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log"
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -54,9 +55,9 @@ rule link_files:
 
 rule write_samples_spreadsheet:
     output:
-        f"{OUTPUT_BASE_PATH}/{config['paths']['target_files']['samples_sheet']}"
+        f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}"
     log:
-        f"{OUTPUT_BASE_PATH}/logs/write_samples_spreadsheet.log"
+        f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log"
     run:
         with open(log[0], "w") as log_fh:
             print("storing samples metadata to {output[0]}", file=log_fh)
@@ -72,13 +73,13 @@ rule cellranger_count:
     input:
         rules.link_files.output
     output:
-        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}/outs")
+        outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger/cellranger_count/{{Species_ID}}/{{individual}}/outs")
     params:
         genome=REFERENCES["all_masked"],
         output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
         #extra=config["rules"]["cellranger_count"]["extra"]
     log:
-        f"{OUTPUT_BASE_PATH}/logs/cellranger_count_{{individual}}.log"
+        f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/cellranger_count_{{individual}}.log"
     threads: 16
     envmodules:
         "cellranger/6.1.1",
@@ -99,10 +100,12 @@ rule construct_sce_objects:
     input: 
         output_cellranger = rules.cellranger_count.output
     output:
-        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/{{individual}}"
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01"
     params:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
     script:
         "scripts/construct_sce_objects.R"
+        
+
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 511668e..1f0c2c5 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -129,7 +129,7 @@ def filter_by_wildcards(
         :return param:
         """
 
-        _identifiers = ["individual"]
+        _identifiers = ["individual", "Species_ID"]
 
         if wildcards:
             filters = dict((k, getattr(wildcards, k)) for k in _identifiers)

From fd312f0cdfc0602902d7f0f669b9471ffdd8a94b Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 9 Nov 2022 20:16:33 +0100
Subject: [PATCH 20/32] add automatic resolution of required conda env for SCE
 construction

---
 config/config-test-lea.yaml              |  4 ++--
 workflow/Snakefile                       |  5 +++--
 workflow/envs/construct_sce_objects.yaml | 12 ++++++++++++
 workflow/scripts/construct_sce_objects.R | 12 +++++-------
 4 files changed, 22 insertions(+), 11 deletions(-)
 create mode 100644 workflow/envs/construct_sce_objects.yaml

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index 9c66a52..d910fe9 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,9 +1,9 @@
 paths:
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
   target_templates:
     linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}"
     cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs"
-    construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
+    construct_sce_objects: "sce_objects/archive/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
   # define specific target files
   target_files:
     samples_sheet: "metadata.csv"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index ae4d191..4b1a849 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -100,12 +100,13 @@ rule construct_sce_objects:
     input: 
         output_cellranger = rules.cellranger_count.output
     output:
-        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01"
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/archive/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01"
     params:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
+    conda:
+        "envs/construct_sce_objects.yaml"
     script:
         "scripts/construct_sce_objects.R"
         
-
diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml
new file mode 100644
index 0000000..048c041
--- /dev/null
+++ b/workflow/envs/construct_sce_objects.yaml
@@ -0,0 +1,12 @@
+name: construct_sce_objects
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - r-base=4.1.3
+  - bioconductor-dropletutils=1.14.2
+  - r-tidyverse=1.3.2
+  
+# set channel priority to strict with conda config --set channel_priority strict
+# use --conda-frontend conda in snakemake command
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index e2625aa..36d379e 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -1,11 +1,8 @@
 #-------------------------------------------------------------------------------
 
-library(DropletUtils, quietly = TRUE) 
 library(tidyverse, quietly = TRUE) 
-# r and library modules must be installed in snakemake conda environment
-# https://anaconda.org/conda-forge/r-base
-# https://anaconda.org/bioconda/bioconductor-dropletutils
-# https://anaconda.org/r/r-tidyverse
+#library(SummarizedExperiment, quietly = TRUE) 
+library(DropletUtils, quietly = TRUE) 
 
 # construct SCE object from raw cellranger output
 print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"))
@@ -23,7 +20,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-wildcard_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
+individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
 IDENTIFIERS <- snakemake@params[["identifiers"]] 
 
 # if necessary concatenate identifiers again to obtain all possible wildcards 
@@ -35,7 +32,7 @@ if(! "individual" %in% colnames(metadata_curr)){
 }
 
 # subset data as specified by wildcard and single_cell_object_metadata_fields
-metadata_curr <- metadata_curr[which(metadata_curr$individual == wildcard_curr),]
+metadata_curr <- metadata_curr[which(metadata_curr$individual == individual_curr),]
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
 metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
 
@@ -44,4 +41,5 @@ for(i in colnames(metadata_curr)){
   colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce))
 }
 
+print(sce)
 saveRDS(sce, file = snakemake@output[["sce_objects"]])
\ No newline at end of file

From 3bdeeeca4ea163936a7161d4e1111e1fa53d6398 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Wed, 9 Nov 2022 22:28:34 +0100
Subject: [PATCH 21/32] removed manual addition of identifiers to samples.py

---
 workflow/Snakefile                       | 14 +++++++----
 workflow/envs/construct_sce_objects.yaml |  4 +---
 workflow/scripts/construct_sce_objects.R |  1 -
 workflow/scripts/samples.py              | 30 +++++++++++-------------
 4 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 4b1a849..de3aaa3 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -13,7 +13,6 @@ REFERENCES = config["references"]
 IDENTIFIERS = config["metadata"]["identifiers"]
 
 print(samples.metadata)
-print(samples.metadata["individual"].unique())
 
 localrules: link_files, write_samples_spreadsheet, all  # local execution of non-demanding rules
 
@@ -96,17 +95,24 @@ rule cellranger_count:
         "> {log}"
 
 # construct SCE objects from counts matrices found in ../alignment/output/
+
+"""
+# I set channel priority to strict with << conda config --set channel_priority strict >>
+# I had to use << --conda-frontend conda >> in snakemake command
+# if used more than a couple times I suggest directly installing the packages into the snakemake env
+"""
+
 rule construct_sce_objects:
     input: 
         output_cellranger = rules.cellranger_count.output
     output:
-        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/archive/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01"
+        sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01"
     params:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
-    conda:
-        "envs/construct_sce_objects.yaml"
+    #conda:
+    #    "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated
     script:
         "scripts/construct_sce_objects.R"
         
diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml
index 048c041..2924286 100644
--- a/workflow/envs/construct_sce_objects.yaml
+++ b/workflow/envs/construct_sce_objects.yaml
@@ -7,6 +7,4 @@ dependencies:
   - r-base=4.1.3
   - bioconductor-dropletutils=1.14.2
   - r-tidyverse=1.3.2
-  
-# set channel priority to strict with conda config --set channel_priority strict
-# use --conda-frontend conda in snakemake command
+  
\ No newline at end of file
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 36d379e..8f1bfd8 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -1,7 +1,6 @@
 #-------------------------------------------------------------------------------
 
 library(tidyverse, quietly = TRUE) 
-#library(SummarizedExperiment, quietly = TRUE) 
 library(DropletUtils, quietly = TRUE) 
 
 # construct SCE object from raw cellranger output
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 1f0c2c5..4915cc4 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -3,7 +3,7 @@
 from snakemake.io import Wildcards
 
 # list identifiers identically to config identifiers and in same order
-IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID", "Sample_NR"] # find a way to lift directly from config
+#IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID"] # find a way to lift directly from config
 
 class Samples:
     """
@@ -27,18 +27,14 @@ class Samples:
                "READ",
                "CELLRANGER_FASTQ_PATH",
                "individual"] # include individual which is later generated
-    
-    columns = columns + IDENTIFIERS
-    # map to rename columns in the format of (old):(new)
-    # Object_ID should uniquely identify each sample/object, it must already be contained in the metadata sheet 
-    # it must be identical to config metadata identifier
-    # I haven't found a way to directly lift it from config yet
-    #columns_map = {
-    #    "Object_ID": "individual" 
-    #}
-    
-
+    #print(IDENTIFIERS)
+    #print(columns)
+    #columns = columns + IDENTIFIERS 
+    #print(columns)
+    #columns = None
+               
     def __init__(self, config):
+        IDENTIFIERS = config["metadata"]["identifiers"]
         metadata_files = config["metadata"]["raw"]
 
         self.output_base_dir = config["paths"]["output_dir"]
@@ -69,7 +65,7 @@ def __init__(self, config):
         
         metadata_full = self.get_cellranger_filename(metadata_full)
 
-        self.metadata = self.select_columns(metadata_full)
+        self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS)
 
     def rename_date_of_birth(self,
                              row: pd.Series):
@@ -109,13 +105,15 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 
     def select_columns(self,
                        df: pd.DataFrame,
-                       columns: list = None):
+                       columns: list = None,
+                       identifiers: str = None):
         """Select/Subset columns from DataFrame to reduce
         DataFrame dimensions. """
+
         if not columns:
-            columns = self.columns
+            columns = self.columns + identifiers
         return df[columns]
-
+            
     @staticmethod
     def filter_by_wildcards(
             wildcards: Wildcards,

From 32a5d5569ab2774115b668c810b599a5fc4fa724 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Sun, 27 Nov 2022 23:16:40 +0100
Subject: [PATCH 22/32] various corrections

---
 config/config-test-lea.yaml | 6 +++---
 workflow/Snakefile          | 7 +++++--
 workflow/scripts/samples.py | 2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/config/config-test-lea.yaml b/config/config-test-lea.yaml
index d910fe9..4be12d7 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-test-lea.yaml
@@ -1,14 +1,14 @@
 paths:
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test4"
   target_templates:
     linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}"
     cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs"
-    construct_sce_objects: "sce_objects/archive/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
+    construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
   # define specific target files
   target_files:
     samples_sheet: "metadata.csv"
 references:
-  all_masked: "/omics/groups/OE0538/internal/users/f844s/2022-bonemarrow/data/masked_reference/GRCm38_masked_allStrains"
+  all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/"
   #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
   #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
   #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index de3aaa3..f4c6cf3 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -41,6 +41,7 @@ rule link_files:
        directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}")
     log:
         f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log"
+    threads: 1
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -57,6 +58,7 @@ rule write_samples_spreadsheet:
         f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}"
     log:
         f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log"
+    threads: 1
     run:
         with open(log[0], "w") as log_fh:
             print("storing samples metadata to {output[0]}", file=log_fh)
@@ -111,8 +113,9 @@ rule construct_sce_objects:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
-    #conda:
-    #    "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated
+    threads: 1
+    conda:
+        "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated
     script:
         "scripts/construct_sce_objects.R"
         
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 4915cc4..c20cdf5 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -127,7 +127,7 @@ def filter_by_wildcards(
         :return param:
         """
 
-        _identifiers = ["individual", "Species_ID"]
+        _identifiers = ["individual", "Species_ID"] # possible wildcards
 
         if wildcards:
             filters = dict((k, getattr(wildcards, k)) for k in _identifiers)

From 7931239045010c1eebc214eabef6db88fe363fc1 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 28 Nov 2022 09:38:01 +0100
Subject: [PATCH 23/32] altered individual column generation and renamed config

---
 ...ml => config-interspecies-bonemarrow.yaml} |  7 ++--
 workflow/Snakefile                            | 13 +++----
 workflow/scripts/construct_sce_objects.R      |  2 +-
 workflow/scripts/samples.py                   | 35 +++----------------
 4 files changed, 14 insertions(+), 43 deletions(-)
 rename config/{config-test-lea.yaml => config-interspecies-bonemarrow.yaml} (92%)

diff --git a/config/config-test-lea.yaml b/config/config-interspecies-bonemarrow.yaml
similarity index 92%
rename from config/config-test-lea.yaml
rename to config/config-interspecies-bonemarrow.yaml
index 4be12d7..b22c35d 100644
--- a/config/config-test-lea.yaml
+++ b/config/config-interspecies-bonemarrow.yaml
@@ -1,7 +1,7 @@
 paths:
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test4"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
   target_templates:
-    linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}"
+    linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" 
     cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs"
     construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
   # define specific target files
@@ -23,7 +23,6 @@ metadata:
     - Age_ID
     - Fraction_ID 
     - Sample_NR
-    #- Objects_ID 
     #- PID # col name automatically included in OTP metadata sheets
     #- Sample Type # col name automatically included in OTP metadata sheets
   single_cell_object_metadata_fields:
@@ -50,7 +49,7 @@ metadata:
 # Enable / Disable rules and specifiy rule-specific parameters
 rules:
   cellranger_count:
-  #  extra: ""  # set additional arguments for cellranger count
+    extra: ""  # set additional arguments for cellranger count
   allele_specific: False
   wasp_filter_reads: False
   realign_bam: False
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index f4c6cf3..eab0548 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -41,7 +41,6 @@ rule link_files:
        directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}")
     log:
         f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log"
-    threads: 1
     run:
         log_fh  = open(log[0], "w")
         os.makedirs(output[0], exist_ok=True)
@@ -58,7 +57,6 @@ rule write_samples_spreadsheet:
         f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}"
     log:
         f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log"
-    threads: 1
     run:
         with open(log[0], "w") as log_fh:
             print("storing samples metadata to {output[0]}", file=log_fh)
@@ -77,8 +75,8 @@ rule cellranger_count:
         outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger/cellranger_count/{{Species_ID}}/{{individual}}/outs")
     params:
         genome=REFERENCES["all_masked"],
-        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1]  # remove last 2 levels from output path
-        #extra=config["rules"]["cellranger_count"]["extra"]
+        output_root_dir=lambda wildcards, output: Path(output[0]).parents[1],  # remove last 2 levels from output path
+        extra=config["rules"]["cellranger_count"]["extra"]
     log:
         f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/cellranger_count_{{individual}}.log"
     threads: 16
@@ -92,7 +90,7 @@ rule cellranger_count:
         "--transcriptome {params.genome} "
         "--fastqs {input} "
         "--localcores={threads} "
-        #"{params.extra} "
+        "{params.extra} "
         "--sample {wildcards.individual} "
         "> {log}"
 
@@ -113,9 +111,8 @@ rule construct_sce_objects:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
-    threads: 1
-    conda:
-        "envs/construct_sce_objects.yaml" # if required packages are installed this can be inactivated
+    conda: # if all required packages are installed in the snakemake environment this can be omitted
+        "envs/construct_sce_objects.yaml" # if all required packages are installed in the snakemake environment this can be omitted
     script:
         "scripts/construct_sce_objects.R"
         
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 8f1bfd8..2adaf20 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -22,7 +22,7 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
 individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
 IDENTIFIERS <- snakemake@params[["identifiers"]] 
 
-# if necessary concatenate identifiers again to obtain all possible wildcards 
+# if necessary, concatenate identifiers again to obtain all possible wildcards 
 metadata_curr <- metadata
 if(! "individual" %in% colnames(metadata_curr)){
   metadata_curr <- metadata_curr %>%
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index c20cdf5..66214b7 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -2,9 +2,6 @@
 
 from snakemake.io import Wildcards
 
-# list identifiers identically to config identifiers and in same order
-#IDENTIFIERS = ["Species_ID", "Age_ID", "Fraction_ID"] # find a way to lift directly from config
-
 class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
@@ -26,12 +23,7 @@ class Samples:
                "LANE_NO",
                "READ",
                "CELLRANGER_FASTQ_PATH",
-               "individual"] # include individual which is later generated
-    #print(IDENTIFIERS)
-    #print(columns)
-    #columns = columns + IDENTIFIERS 
-    #print(columns)
-    #columns = None
+               "individual"] # include 'individual' col to be generated below
                
     def __init__(self, config):
         IDENTIFIERS = config["metadata"]["identifiers"]
@@ -42,27 +34,10 @@ def __init__(self, config):
 
         metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True)
 
-        """Make "individual" column
-        
-        A single identifier col is copied and renamed to "individual" but 
-        entries from multiple identifier cols are concatenated to "individual".
-        
-        Entries in the "individual" col are used as wildcards.
-        
-        For metadata_full it is used for downstream functions (?).
-        
-        This may cause a warning that I still have to take care of but that doesn't seem critical.
-        """       
+        # generate 'individual' column containing all concatenated identifiers
         if not "individual" in metadata_full.columns: 
-            print('establishing "individual" column') 
-            for i in IDENTIFIERS:
-                if i == IDENTIFIERS[0]:
-                    metadata_full["individual"] = metadata_full[i].map(str)
-                else: 
-                    metadata_full["individual"] = metadata_full["individual"] + '_' + metadata_full[i].map(str)
-      
-        metadata_full["DATE_OF_BIRTH"] = metadata_full.apply(self.rename_date_of_birth, axis=1)
-        
+            metadata_full['individual'] = metadata_full[IDENTIFIERS].apply('_'.join, axis=1)
+   
         metadata_full = self.get_cellranger_filename(metadata_full)
 
         self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS)
@@ -127,7 +102,7 @@ def filter_by_wildcards(
         :return param:
         """
 
-        _identifiers = ["individual", "Species_ID"] # possible wildcards
+        _identifiers = ["individual", "Species_ID"] 
 
         if wildcards:
             filters = dict((k, getattr(wildcards, k)) for k in _identifiers)

From 91401fadf0fc12488ae32371e439ac22a2c9f1ae Mon Sep 17 00:00:00 2001
From: leawlb <76486615+leawlb@users.noreply.github.com>
Date: Mon, 28 Nov 2022 10:11:37 +0100
Subject: [PATCH 24/32] Update README.md

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index ff6e88e..0faa717 100644
--- a/README.md
+++ b/README.md
@@ -58,3 +58,12 @@ snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --c
  - `--cluster` may change depending on the computational footprint of your analyses
  - `--configfile` should point to your personal configuration
 
+For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. 
+For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add:
+
+```bash
+--conda-frontend conda
+```
+
+to the snakemake command.
+For multiple runs, it is recommended to install these packages directly into the snakemake environment as well.

From 28fc10680b8caf868ab798e1286dd9c00da28ff0 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 26 Jan 2023 16:17:27 +0100
Subject: [PATCH 25/32] added complete env files and smalle improvements

---
 config/config-interspecies-bonemarrow.yaml |  27 ++-
 workflow/Snakefile                         |  20 +-
 workflow/envs/construct_sce_objects.yaml   | 247 ++++++++++++++++++++-
 workflow/envs/snakemake_cellranger.yaml    | 181 +++++++++++++++
 workflow/scripts/construct_sce_objects.R   |   1 +
 5 files changed, 448 insertions(+), 28 deletions(-)
 create mode 100644 workflow/envs/snakemake_cellranger.yaml

diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml
index b22c35d..872907b 100644
--- a/config/config-interspecies-bonemarrow.yaml
+++ b/config/config-interspecies-bonemarrow.yaml
@@ -1,6 +1,12 @@
 paths:
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
+  # directory to store output data = cellranger files and SingleCellExperiment objects
+  # paths to input data = fastq files are stored in metadata sheet (OTP export)
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test"
   target_templates:
+    # paths for each type of output file
+      # remove "/{0[Species_ID]}" to store all objects in one directory or
+      # replace "Species_ID" with other colname as suitable wildcard from metadata sheet (e.g. Sample_Type, ...) to create subfolders
+      # remove or replace all instances of Species_ID in Snakefile and samples.py 
     linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" 
     cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs"
     construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01"
@@ -9,27 +15,24 @@ paths:
     samples_sheet: "metadata.csv"
 references:
   all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/"
-  #CAST_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_cast/"
-  #SPRET_EiJ: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
-  #C57BL6J: "/omics/groups/OE0538/internal/users/panten/projects/genome_files/CellRangerRNA/B6_masked_spret/"
 metadata:
   raw:
+  # should have same format as or be based on OTP export sheet
    - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv"
   identifiers: 
-  # Define column name(s) used to define a sample (in correct order)
-  # entries from multiple cols are concatenated into one sample-specific name
+  # define one or multiple column name(s) that uniquely define a sample/object 
+  # multiple cols are concatenated into one sample-specific name
   # in the format of {Species_ID}_{Age_ID}_{Fraction_ID}_{Sample_NR}
-    - Species_ID # col names I manually added to metadata sheet beforehand
+    - Species_ID # columns I manually added to metadata sheet 
     - Age_ID
     - Fraction_ID 
     - Sample_NR
-    #- PID # col name automatically included in OTP metadata sheets
-    #- Sample Type # col name automatically included in OTP metadata sheets
+    #- PID # columns included in OTP metadata sheet
+    #- Sample_Type 
   single_cell_object_metadata_fields:
   # Define all columns from the metadata spreadsheet that
-  # will be included in the SingleCellExperiment / Seurat
-  # objects
-      - Object_ID # colnames I manually added to metadata 
+  # will be included in the SingleCellExperiment / Seurat objects
+      - Object_ID # columns I manually added to metadata sheet
       - Mouse_ID
       - Species_ID
       - Age
diff --git a/workflow/Snakefile b/workflow/Snakefile
index eab0548..9bc9bca 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -94,15 +94,17 @@ rule cellranger_count:
         "--sample {wildcards.individual} "
         "> {log}"
 
-# construct SCE objects from counts matrices found in ../alignment/output/
-
-"""
-# I set channel priority to strict with << conda config --set channel_priority strict >>
-# I had to use << --conda-frontend conda >> in snakemake command
-# if used more than a couple times I suggest directly installing the packages into the snakemake env
-"""
 
 rule construct_sce_objects:
+    """construct SCE objects from counts matrices
+
+    set channel priority to strict with << conda config --set channel_priority strict >>
+    use << --conda-frontend conda >> in snakemake command
+    
+    Initially, building the environment might take some time.
+    
+    Note: output must match target_templates in configuration YAML.
+    """
     input: 
         output_cellranger = rules.cellranger_count.output
     output:
@@ -111,8 +113,8 @@ rule construct_sce_objects:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
-    conda: # if all required packages are installed in the snakemake environment this can be omitted
-        "envs/construct_sce_objects.yaml" # if all required packages are installed in the snakemake environment this can be omitted
+    conda: 
+        "envs/construct_sce_objects.yaml" 
     script:
         "scripts/construct_sce_objects.R"
         
diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml
index 2924286..bd49ff1 100644
--- a/workflow/envs/construct_sce_objects.yaml
+++ b/workflow/envs/construct_sce_objects.yaml
@@ -1,10 +1,243 @@
-name: construct_sce_objects
+# this environment is automatically created by snakemake
+name: construct-sce-objects
 channels:
-  - conda-forge
-  - bioconda
-  - defaults
+- conda-forge
+- bioconda
+- defaults
 dependencies:
-  - r-base=4.1.3
-  - bioconductor-dropletutils=1.14.2
-  - r-tidyverse=1.3.2
+- r-dbplyr=2.3.0=r41hc72bb7e_0
+- bioconductor-delayedmatrixstats=1.16.0=r41hdfd78af_0
+- font-ttf-ubuntu=0.83=hab24e00_0
+- libsanitizer=12.2.0=h46fd767_19
+- bwidget=1.9.14=ha770c72_1
+- r-bit=4.0.5=r41h06615bd_0
+- libcurl=7.86.0=h7bff187_1
+- libpng=1.6.39=h753d276_0
+- r-digest=0.6.31=r41h38f115c_0
+- r-fastmap=1.1.0=r41h7525677_1
+- r-cpp11=0.4.3=r41hc72bb7e_0
+- _r-mutex=1.0.1=anacondar_1
+- r-stringi=1.7.12=r41h1ae9187_0
+- r-forcats=0.5.2=r41hc72bb7e_1
+- r-withr=2.5.0=r41hc72bb7e_1
+- r-crayon=1.5.2=r41hc72bb7e_1
+- r-highr=0.10=r41hc72bb7e_0
+- r-sys=3.4.1=r41h06615bd_0
+- r-formatr=1.14=r41hc72bb7e_0
+- r-base=4.1.3=h2f963a2_5
+- bioconductor-delayedarray=0.20.0=r41hc0cfd56_2
+- libstdcxx-ng=12.2.0=h46fd767_19
+- tktable=2.10=hb7b940f_3
+- r-sass=0.4.5=r41h38f115c_0
+- lerc=4.0.0=h27087fc_0
+- libgcc-devel_linux-64=12.2.0=h3b97bd3_19
+- bioconductor-scuttle=1.4.0=r41hc247a5b_2
+- libgfortran5=12.2.0=h337968e_19
+- r-colorspace=2.1_0=r41h133d619_0
+- r-ids=1.0.1=r41hc72bb7e_2
+- libstdcxx-devel_linux-64=12.2.0=h3b97bd3_19
+- jpeg=9e=h166bdaf_2
+- r-callr=3.7.3=r41hc72bb7e_0
+- r-curl=4.3.3=r41h06615bd_1
+- fribidi=1.0.10=h36c2ea0_0
+- krb5=1.19.3=h3790be6_0
+- bioconductor-xvector=0.34.0=r41hc0cfd56_2
+- bioconductor-biocparallel=1.28.3=r41hc247a5b_1
+- libgcc-ng=12.2.0=h65d4601_19
+- r-r.utils=2.12.2=r41hc72bb7e_0
+- r-lubridate=1.9.1=r41h133d619_0
+- r-base64enc=0.1_3=r41h06615bd_1005
+- r-readr=2.1.3=r41h7525677_1
+- ca-certificates=2022.12.7=ha878542_0
+- xorg-kbproto=1.0.7=h7f98852_1002
+- r-xml2=1.3.3=r41h044e5c7_2
+- r-scales=1.2.1=r41hc72bb7e_1
+- sed=4.8=he412f7d_0
+- harfbuzz=6.0.0=h8e241bc_0
+- readline=8.1.2=h0f457ee_0
+- zstd=1.5.2=h3eb15da_6
+- r-rcurl=1.98_1.9=r41h06615bd_1
+- libuuid=2.32.1=h7f98852_1000
+- zlib=1.2.13=h166bdaf_4
+- r-r6=2.5.1=r41hc72bb7e_1
+- bioconductor-edger=3.36.0=r41hc247a5b_2
+- libzlib=1.2.13=h166bdaf_4
+- r-rcolorbrewer=1.1_3=r41h785f33e_1
+- libedit=3.1.20191231=he28a2e2_2
+- r-farver=2.1.1=r41h7525677_1
+- r-lattice=0.20_45=r41h06615bd_1
+- libgfortran-ng=12.2.0=h69a702a_19
+- fonts-conda-ecosystem=1=0
+- r-glue=1.6.2=r41h06615bd_1
+- xorg-libice=1.0.10=h7f98852_0
+- r-utf8=1.2.2=r41h06615bd_1
+- xorg-libxt=1.2.1=h7f98852_2
+- pandoc=2.19.2=h32600fe_1
+- bioconductor-genomicranges=1.46.1=r41hc0cfd56_1
+- r-pkgconfig=2.0.3=r41hc72bb7e_2
+- r-rstudioapi=0.14=r41hc72bb7e_1
+- r-memoise=2.0.1=r41hc72bb7e_1
+- r-futile.options=1.0.1=r41hc72bb7e_1003
+- r-snow=0.4_4=r41hc72bb7e_1
+- r-httr=1.4.4=r41hc72bb7e_1
+- libffi=3.4.2=h7f98852_5
+- _openmp_mutex=4.5=2_gnu
+- sysroot_linux-64=2.12=he073ed8_15
+- keyutils=1.6.1=h166bdaf_0
+- bioconductor-biobase=2.54.0=r41hc0cfd56_2
+- r-googledrive=2.0.0=r41hc72bb7e_1
+- r-rvest=1.0.3=r41hc72bb7e_1
+- _libgcc_mutex=0.1=conda_forge
+- bioconductor-singlecellexperiment=1.16.0=r41hdfd78af_0
+- r-yaml=2.3.7=r41h133d619_0
+- r-selectr=0.4_2=r41hc72bb7e_2
+- r-generics=0.1.3=r41hc72bb7e_1
+- r-backports=1.4.1=r41h06615bd_1
+- r-ps=1.7.2=r41h06615bd_0
+- font-ttf-inconsolata=3.000=h77eed37_0
+- fonts-conda-forge=1=0
+- bioconductor-genomeinfodbdata=1.2.7=r41hdfd78af_2
+- r-gargle=1.2.1=r41hc72bb7e_1
+- r-viridislite=0.4.1=r41hc72bb7e_1
+- r-nlme=3.1_161=r41hac0b197_0
+- r-hms=1.1.2=r41hc72bb7e_1
+- r-munsell=0.5.0=r41hc72bb7e_1005
+- ld_impl_linux-64=2.39=hcc3a1bd_1
+- curl=7.86.0=h7bff187_1
+- c-ares=1.18.1=h7f98852_0
+- r-sitmo=2.0.2=r41h7525677_1
+- r-locfit=1.5_9.7=r41h133d619_0
+- bioconductor-matrixgenerics=1.6.0=r41hdfd78af_0
+- r-tzdb=0.3.0=r41h7525677_1
+- pango=1.50.12=hd33c08f_1
+- r-jsonlite=1.8.4=r41h133d619_0
+- libev=4.33=h516909a_1
+- libwebp-base=1.2.4=h166bdaf_0
+- bioconductor-dropletutils=1.14.2=r41hc247a5b_1
+- r-dplyr=1.0.10=r41h7525677_1
+- bioconductor-beachmat=2.10.0=r41hc247a5b_2
+- r-vroom=1.6.1=r41h38f115c_0
+- r-data.table=1.14.6=r41h06615bd_0
+- pixman=0.40.0=h36c2ea0_0
+- r-tibble=3.1.8=r41h06615bd_1
+- r-xfun=0.36=r41h38f115c_0
+- bioconductor-sparsematrixstats=1.6.0=r41hc247a5b_2
+- r-evaluate=0.20=r41hc72bb7e_0
+- xorg-xproto=7.0.31=h7f98852_1007
+- r-jquerylib=0.1.4=r41hc72bb7e_1
+- xorg-libxrender=0.9.10=h7f98852_1003
+- r-mass=7.3_58.2=r41h133d619_0
+- r-r.oo=1.25.0=r41hc72bb7e_1
+- r-rematch2=2.1.2=r41hc72bb7e_2
+- r-mime=0.12=r41h06615bd_1
+- liblapack=3.9.0=16_linux64_openblas
+- r-processx=3.8.0=r41h06615bd_0
+- r-blob=1.2.3=r41hc72bb7e_1
+- libblas=3.9.0=16_linux64_openblas
+- bioconductor-zlibbioc=1.40.0=r41hc0cfd56_2
+- r-haven=2.5.1=r41h7525677_0
+- r-timechange=0.2.0=r41h38f115c_0
+- r-matrixstats=0.63.0=r41h06615bd_0
+- r-assertthat=0.2.1=r41hc72bb7e_3
+- xorg-xextproto=7.3.0=h7f98852_1002
+- bioconductor-s4vectors=0.32.4=r41hc0cfd56_0
+- r-rappdirs=0.3.3=r41h06615bd_1
+- r-ggplot2=3.4.0=r41hc72bb7e_1
+- xz=5.2.6=h166bdaf_0
+- libgomp=12.2.0=h65d4601_19
+- r-cachem=1.0.6=r41h06615bd_1
+- r-tidyverse=1.3.2=r41hc72bb7e_1
+- r-clipr=0.8.0=r41hc72bb7e_1
+- r-rcpp=1.0.10=r41h38f115c_0
+- font-ttf-source-code-pro=2.038=h77eed37_0
+- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
+- bioconductor-iranges=2.28.0=r41hc0cfd56_2
+- libnghttp2=1.51.0=hdcd2b5c_0
+- bioconductor-rhdf5lib=1.16.0=r41hc0cfd56_2
+- r-fansi=1.0.4=r41h133d619_0
+- kernel-headers_linux-64=2.6.32=he073ed8_15
+- gcc_impl_linux-64=12.2.0=hcc96c02_19
+- r-pillar=1.8.1=r41hc72bb7e_1
+- gettext=0.21.1=h27087fc_0
+- libiconv=1.17=h166bdaf_0
+- r-bitops=1.0_7=r41h06615bd_1
+- bioconductor-hdf5array=1.22.1=r41hc0cfd56_1
+- r-purrr=1.0.1=r41h133d619_0
+- r-broom=1.0.3=r41hc72bb7e_0
+- r-bh=1.81.0_1=r41hc72bb7e_0
+- libxml2=2.10.3=h7463322_0
+- r-isoband=0.2.7=r41h38f115c_1
+- r-futile.logger=1.4.3=r41hc72bb7e_1004
+- r-reprex=2.0.2=r41hc72bb7e_1
+- r-openssl=2.0.5=r41hb1dc35e_0
+- r-lifecycle=1.0.3=r41hc72bb7e_1
+- r-stringr=1.5.0=r41h785f33e_0
+- r-tidyselect=1.2.0=r41hc72bb7e_0
+- r-lambda.r=1.2.4=r41hc72bb7e_2
+- expat=2.5.0=h27087fc_0
+- r-r.methodss3=1.8.2=r41hc72bb7e_1
+- r-dbi=1.1.3=r41hc72bb7e_1
+- r-rmarkdown=2.20=r41hc72bb7e_0
+- xorg-libx11=1.7.2=h7f98852_0
+- bzip2=1.0.8=h7f98852_4
+- r-cli=3.6.0=r41h38f115c_0
+- r-gtable=0.3.1=r41hc72bb7e_1
+- libtiff=4.5.0=h6adf6a1_2
+- libopenblas=0.3.21=pthreads_h78a6416_3
+- tk=8.6.12=h27826a3_0
+- bioconductor-genomeinfodb=1.30.1=r41hdfd78af_0
+- bioconductor-limma=3.50.3=r41hc0cfd56_0
+- xorg-libxau=1.0.9=h7f98852_0
+- r-cellranger=1.1.0=r41hc72bb7e_1005
+- make=4.3=hd18ef5c_1
+- r-rematch=1.0.1=r41hc72bb7e_1005
+- xorg-renderproto=0.11.1=h7f98852_1002
+- graphite2=1.3.13=h58526e2_1001
+- r-htmltools=0.5.4=r41h38f115c_0
+- r-mgcv=1.8_41=r41h5f7b363_0
+- r-knitr=1.42=r41hc72bb7e_1
+- libdeflate=1.17=h0b41bf4_0
+- bioconductor-rhdf5filters=1.6.0=r41hc247a5b_2
+- fontconfig=2.14.1=hc2a2eb6_0
+- r-bslib=0.4.2=r41hc72bb7e_0
+- gfortran_impl_linux-64=12.2.0=h55be85b_19
+- r-dqrng=0.3.0=r41h7525677_1
+- libxcb=1.13=h7f98852_1004
+- xorg-libsm=1.2.3=hd9c2040_1000
+- r-googlesheets4=1.0.1=r41h785f33e_1
+- r-prettyunits=1.1.1=r41hc72bb7e_2
+- libcblas=3.9.0=16_linux64_openblas
+- bioconductor-rhdf5=2.38.1=r41hbe1951d_0
+- binutils_impl_linux-64=2.39=he00db2b_1
+- icu=70.1=h27087fc_0
+- pcre2=10.40=hc3806b6_0
+- r-rlang=1.0.6=r41h7525677_1
+- r-readxl=1.4.1=r41h3ebcfa7_1
+- pthread-stubs=0.4=h36c2ea0_1001
+- r-ellipsis=0.3.2=r41h06615bd_1
+- r-magrittr=2.0.3=r41h06615bd_1
+- openssl=1.1.1s=h0b41bf4_1
+- gxx_impl_linux-64=12.2.0=hcc96c02_19
+- r-progress=1.2.2=r41hc72bb7e_3
+- r-fs=1.6.0=r41h38f115c_0
+- xorg-libxext=1.3.4=h7f98852_1
+- gsl=2.7=he838d99_0
+- r-tidyr=1.3.0=r41h38f115c_0
+- libssh2=1.10.0=haa6b8db_3
+- r-bit64=4.0.5=r41h06615bd_1
+- ncurses=6.3=h27087fc_1
+- r-matrix=1.5_3=r41h5f7b363_0
+- r-uuid=1.1_0=r41h06615bd_1
+- cairo=1.16.0=ha61ee94_1014
+- libglib=2.74.1=h606061b_1
+- r-vctrs=0.5.2=r41h38f115c_0
+- freetype=2.12.1=hca18f0e_1
+- r-tinytex=0.43=r41hc72bb7e_0
+- xorg-libxdmcp=1.1.3=h7f98852_0
+- bioconductor-biocgenerics=0.40.0=r41hdfd78af_0
+- r-dtplyr=1.2.2=r41hc72bb7e_2
+- r-modelr=0.1.10=r41hc72bb7e_0
+- r-askpass=1.1=r41h06615bd_3
+- bioconductor-summarizedexperiment=1.24.0=r41hdfd78af_0
+- r-labeling=0.4.2=r41hc72bb7e_2
   
\ No newline at end of file
diff --git a/workflow/envs/snakemake_cellranger.yaml b/workflow/envs/snakemake_cellranger.yaml
new file mode 100644
index 0000000..d44ce8f
--- /dev/null
+++ b/workflow/envs/snakemake_cellranger.yaml
@@ -0,0 +1,181 @@
+# this env is required, please install and activate for running snakemake
+name: snakemake-cellranger
+channels:
+- conda-forge
+- bioconda
+- defaults
+dependencies:
+- libsqlite=3.39.4=h753d276_0
+- toposort=1.7=pyhd8ed1ab_0
+- importlib-metadata=5.0.0=pyha770c72_1
+- jinja2=3.1.2=pyhd8ed1ab_1
+- exceptiongroup=1.0.1=pyhd8ed1ab_0
+- veracitools=0.1.3=py_0
+- google-cloud-core=2.3.2=pyhd8ed1ab_0
+- coincbc=2.10.8=0_metapackage
+- oauth2client=4.1.3=py_0
+- ubiquerg=0.6.2=pyhd8ed1ab_0
+- google-auth=2.14.0=pyh1a96a4e_0
+- python-dateutil=2.8.2=pyhd8ed1ab_0
+- liblapacke=3.9.0=16_linux64_openblas
+- micromamba=1.0.0=1
+- coin-or-clp=1.17.7=hc56784d_2
+- aioeasywebdav=2.4.0=pyha770c72_0
+- libgomp=12.2.0=h65d4601_19
+- ply=3.11=py_1
+- importlib_resources=5.10.0=pyhd8ed1ab_0
+- libgfortran5=12.2.0=h337968e_19
+- datrie=0.8.2=py310h5764c6d_6
+- pip=22.3.1=pyhd8ed1ab_0
+- packaging=21.3=pyhd8ed1ab_0
+- dataclasses=0.8=pyhc8e2a94_3
+- pycparser=2.21=pyhd8ed1ab_0
+- configargparse=1.5.3=pyhd8ed1ab_0
+- urllib3=1.26.11=pyhd8ed1ab_0
+- colorama=0.4.6=pyhd8ed1ab_0
+- yarl=1.8.1=py310h5764c6d_0
+- psutil=5.9.4=py310h5764c6d_0
+- plac=1.3.5=pyhd8ed1ab_0
+- certifi=2022.9.24=pyhd8ed1ab_0
+- markupsafe=2.1.1=py310h5764c6d_2
+- toolz=0.12.0=pyhd8ed1ab_0
+- cachetools=5.2.0=pyhd8ed1ab_0
+- google-crc32c=1.1.2=py310he8fe98e_4
+- amply=0.1.5=pyhd8ed1ab_0
+- reretry=0.11.1=pyhd8ed1ab_0
+- c-ares=1.18.1=h7f98852_0
+- libsodium=1.0.18=h36c2ea0_1
+- peppy=0.35.2=pyhd8ed1ab_0
+- snakemake=7.18.1=hdfd78af_0
+- pytz=2022.6=pyhd8ed1ab_0
+- pytest=7.2.0=pyhd8ed1ab_2
+- libcblas=3.9.0=16_linux64_openblas
+- httplib2=0.21.0=pyhd8ed1ab_0
+- yte=1.5.1=py310hff52083_1
+- pyrsistent=0.19.2=py310h5764c6d_0
+- libgrpc=1.49.1=h30feacc_1
+- pulp=2.7.0=py310hff52083_0
+- attrs=22.1.0=pyh71513ae_1
+- pandas=1.5.1=py310h769672d_1
+- multidict=6.0.2=py310h5764c6d_2
+- connection_pool=0.0.3=pyhd3deb0d_0
+- _libgcc_mutex=0.1=conda_forge
+- google-api-core=2.10.2=pyhd8ed1ab_0
+- smmap=3.0.5=pyh44b312d_0
+- pygments=2.13.0=pyhd8ed1ab_0
+- wheel=0.38.3=pyhd8ed1ab_0
+- docutils=0.19=py310hff52083_1
+- ftputil=5.0.4=pyhd8ed1ab_0
+- conda=22.9.0=py310hff52083_2
+- gitdb=4.0.9=pyhd8ed1ab_0
+- aiohttp=3.8.3=py310h5764c6d_1
+- libgfortran-ng=12.2.0=h69a702a_19
+- stopit=1.1.2=py_0
+- defusedxml=0.7.1=pyhd8ed1ab_0
+- liblapack=3.9.0=16_linux64_openblas
+- backports=1.0=py_2
+- numpy=1.23.4=py310h53a5b5f_1
+- iniconfig=1.1.1=pyh9f0ad1d_0
+- snakemake-minimal=7.18.1=pyhdfd78af_0
+- coin-or-cbc=2.10.8=h3786ebc_0
+- tzdata=2022f=h191b570_0
+- readline=8.1.2=h0f457ee_0
+- frozenlist=1.3.3=py310h5764c6d_0
+- filelock=3.8.0=pyhd8ed1ab_0
+- ruamel_yaml=0.15.80=py310h5764c6d_1008
+- pycosat=0.6.4=py310h5764c6d_1
+- logmuse=0.2.6=pyh8c360ce_0
+- boto3=1.26.5=pyhd8ed1ab_0
+- pyparsing=3.0.9=pyhd8ed1ab_0
+- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
+- pyu2f=0.1.5=pyhd8ed1ab_0
+- protobuf=4.21.9=py310hd8f1fbe_0
+- conda-package-handling=1.9.0=py310h5764c6d_1
+- pysftp=0.2.9=py_1
+- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
+- python-fastjsonschema=2.16.2=pyhd8ed1ab_0
+- ld_impl_linux-64=2.39=hc81fddc_0
+- ncurses=6.3=h27087fc_1
+- googleapis-common-protos=1.56.4=py310hff52083_1
+- google-cloud-storage=2.6.0=pyh1a96a4e_0
+- slacker=0.14.0=py_0
+- libffi=3.4.2=h7f98852_5
+- _openmp_mutex=4.5=2_gnu
+- libzlib=1.2.13=h166bdaf_4
+- coin-or-utils=2.11.6=h202d8b1_2
+- libabseil=20220623.0=cxx17_h48a1fff_5
+- prettytable=3.4.1=pyhd8ed1ab_0
+- cffi=1.15.1=py310h255011f_2
+- typing-extensions=4.4.0=hd8ed1ab_0
+- pyyaml=6.0=py310h5764c6d_5
+- dropbox=11.35.0=pyhd8ed1ab_0
+- python_abi=3.10=2_cp310
+- nbformat=5.7.0=pyhd8ed1ab_0
+- libnsl=2.0.0=h7f98852_0
+- stone=3.3.1=pyhd8ed1ab_0
+- libblas=3.9.0=16_linux64_openblas
+- cryptography=38.0.3=py310h600f1e7_0
+- google-auth-httplib2=0.1.0=pyhd8ed1ab_1
+- tk=8.6.12=h27826a3_0
+- libopenblas=0.3.21=pthreads_h78a6416_3
+- pyasn1-modules=0.2.7=py_0
+- jsonschema=4.17.0=pyhd8ed1ab_0
+- coin-or-cgl=0.60.6=h6f57e76_2
+- dpath=2.0.6=py310hff52083_2
+- google-api-python-client=2.65.0=pyhd8ed1ab_0
+- setuptools-scm=7.0.5=pyhd8ed1ab_1
+- rsa=4.9=pyhd8ed1ab_0
+- pyasn1=0.4.8=py_0
+- wcwidth=0.2.5=pyh9f0ad1d_2
+- tqdm=4.64.1=pyhd8ed1ab_0
+- traitlets=5.5.0=pyhd8ed1ab_0
+- wrapt=1.14.1=py310h5764c6d_1
+- zipp=3.10.0=pyhd8ed1ab_0
+- botocore=1.29.5=pyhd8ed1ab_0
+- idna=3.4=pyhd8ed1ab_0
+- google-resumable-media=2.4.0=pyhd8ed1ab_0
+- bcrypt=3.2.2=py310h5764c6d_1
+- attmap=0.13.2=pyhd8ed1ab_0
+- requests=2.28.1=pyhd8ed1ab_1
+- xz=5.2.6=h166bdaf_0
+- grpcio=1.49.1=py310hc32fa93_1
+- libprotobuf=3.21.9=h6239696_0
+- gitpython=3.1.29=pyhd8ed1ab_0
+- s3transfer=0.6.0=pyhd8ed1ab_0
+- yaml=0.2.5=h7f98852_2
+- ratelimiter=1.2.0=pyhd8ed1ab_1003
+- bzip2=1.0.8=h7f98852_4
+- ca-certificates=2022.9.24=ha878542_0
+- uritemplate=4.1.1=pyhd8ed1ab_0
+- future=0.18.2=pyhd8ed1ab_6
+- jupyter_core=4.11.2=py310hff52083_0
+- pluggy=1.0.0=pyhd8ed1ab_5
+- brotlipy=0.7.0=py310h5764c6d_1005
+- jmespath=1.0.1=pyhd8ed1ab_0
+- setuptools=65.5.1=pyhd8ed1ab_0
+- libcrc32c=1.1.2=h9c3ff4c_0
+- libstdcxx-ng=12.2.0=h46fd767_19
+- commonmark=0.9.1=py_0
+- zlib=1.2.13=h166bdaf_4
+- tabulate=0.9.0=pyhd8ed1ab_1
+- re2=2022.06.01=h27087fc_0
+- appdirs=1.4.4=pyh9f0ad1d_0
+- aiosignal=1.3.1=pyhd8ed1ab_0
+- paramiko=2.12.0=pyhd8ed1ab_0
+- filechunkio=1.8=py_2
+- charset-normalizer=2.1.1=pyhd8ed1ab_0
+- python-irodsclient=1.1.5=pyhd8ed1ab_0
+- rich=12.6.0=pyhd8ed1ab_0
+- async-timeout=4.0.2=pyhd8ed1ab_0
+- six=1.16.0=pyh6c4a22f_0
+- coin-or-osi=0.108.7=h2720bb7_2
+- pyopenssl=22.1.0=pyhd8ed1ab_0
+- libuuid=2.32.1=h7f98852_1000
+- pysocks=1.7.1=pyha2e5f31_6
+- openssl=3.0.7=h166bdaf_0
+- typing_extensions=4.4.0=pyha770c72_0
+- python=3.10.6=ha86cf86_0_cpython
+- pynacl=1.5.0=py310h5764c6d_2
+- smart_open=6.2.0=pyha770c72_0
+- tomli=2.0.1=pyhd8ed1ab_0
+- libgcc-ng=12.2.0=h65d4601_19
\ No newline at end of file
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 2adaf20..c4565f7 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -2,6 +2,7 @@
 
 library(tidyverse, quietly = TRUE) 
 library(DropletUtils, quietly = TRUE) 
+set.seed(1)
 
 # construct SCE object from raw cellranger output
 print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"))

From 17648670cf91339d3d68267d49cb554e8ae866f6 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Tue, 31 Jan 2023 13:44:51 +0100
Subject: [PATCH 26/32] Merge branch 'add_SCE' of
 https://github.com/odomlab2/snakemake-cellranger into add_SCE

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index ff6e88e..0faa717 100644
--- a/README.md
+++ b/README.md
@@ -58,3 +58,12 @@ snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --c
  - `--cluster` may change depending on the computational footprint of your analyses
  - `--configfile` should point to your personal configuration
 
+For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. 
+For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add:
+
+```bash
+--conda-frontend conda
+```
+
+to the snakemake command.
+For multiple runs, it is recommended to install these packages directly into the snakemake environment as well.

From ed54ac16ea5b216059b647dbdcd03a8e6a07e73c Mon Sep 17 00:00:00 2001
From: Fritjof Lammers <f.lammers@dkfz-heidelberg.de>
Date: Fri, 17 Feb 2023 12:09:49 +0100
Subject: [PATCH 27/32] extract method to rename BIRTH columns

---
 utils/process_otp_metadata.py | 13 +++++++++++++
 workflow/scripts/samples.py   | 15 ++-------------
 2 files changed, 15 insertions(+), 13 deletions(-)
 create mode 100644 utils/process_otp_metadata.py

diff --git a/utils/process_otp_metadata.py b/utils/process_otp_metadata.py
new file mode 100644
index 0000000..10ca797
--- /dev/null
+++ b/utils/process_otp_metadata.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+
+def rename_date_of_birth(row: pd.Series):
+    """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into
+    a single 'DATE_OF_BIRTH' column.
+    """
+    values = [row["BIRTH"], row["DATE_OF_BIRTH"]]
+    dates = [val for val in values if not pd.isna(val)]
+    if len(dates) == 1:
+        return dates[0]
+    elif len(dates) != 1:
+        return pd.NA
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 66214b7..a447908 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -2,6 +2,7 @@
 
 from snakemake.io import Wildcards
 
+
 class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
@@ -23,7 +24,7 @@ class Samples:
                "LANE_NO",
                "READ",
                "CELLRANGER_FASTQ_PATH",
-               "individual"] # include 'individual' col to be generated below
+               "individual"]  # include 'individual' col to be generated below
                
     def __init__(self, config):
         IDENTIFIERS = config["metadata"]["identifiers"]
@@ -42,18 +43,6 @@ def __init__(self, config):
 
         self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS)
 
-    def rename_date_of_birth(self,
-                             row: pd.Series):
-        """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into
-        a single 'DATE_OF_BIRTH' column.
-        """
-        values = [row["BIRTH"], row["DATE_OF_BIRTH"]]
-        dates = [val for val in values if not pd.isna(val)]
-        if len(dates) == 1:
-            return dates[0]
-        elif len(dates) != 1:
-            return pd.NA
-
     def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 
         """Add column containing CellRanger compatible filename,

From 5d9d69f11676bb593357a1ee8b622a8da4e2b081 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 27 Mar 2023 15:30:09 +0200
Subject: [PATCH 28/32] improved readability

---
 README.md                                  |  20 ++-
 config/config-interspecies-bonemarrow.yaml |   4 +-
 workflow/Snakefile                         |   2 +-
 workflow/envs/snakemake_cellranger.yaml    | 181 ---------------------
 workflow/scripts/construct_sce_objects.R   |  19 +--
 5 files changed, 23 insertions(+), 203 deletions(-)
 delete mode 100644 workflow/envs/snakemake_cellranger.yaml

diff --git a/README.md b/README.md
index 0faa717..4341f2a 100644
--- a/README.md
+++ b/README.md
@@ -49,21 +49,23 @@ Minimal changes needed are:
 
 ## How to run 
 
-You may call the pipeline as follows in the directory where you cloned it. 
+Use snakemake_cellranger.yaml to build an environment with all required packages.
 
 ```bash
-snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda  --use-envmodules
+micromamba create -f snakemake_cellranger.yaml
 ```
 
- - `--cluster` may change depending on the computational footprint of your analyses
- - `--configfile` should point to your personal configuration
+or
 
-For generating SCE objects, R (version as specified in construct_sce_objects.yaml) has to be installed in the snakemake environment. 
-For using the construct_sce_objects.yaml environment to resolve additional required packages automatically, also add:
+```bash
+conda env create -f snakemake_cellranger.yaml
+```
+
+You may call the pipeline as follows in the directory where you cloned it. 
 
 ```bash
---conda-frontend conda
+snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda  --use-envmodules --conda-frontend conda
 ```
 
-to the snakemake command.
-For multiple runs, it is recommended to install these packages directly into the snakemake environment as well.
+ - `--cluster` may change depending on the computational footprint of your analyses
+ - `--configfile` should point to your personal configuration
diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml
index 872907b..3eedb8e 100644
--- a/config/config-interspecies-bonemarrow.yaml
+++ b/config/config-interspecies-bonemarrow.yaml
@@ -1,7 +1,7 @@
 paths:
   # directory to store output data = cellranger files and SingleCellExperiment objects
   # paths to input data = fastq files are stored in metadata sheet (OTP export)
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
   target_templates:
     # paths for each type of output file
       # remove "/{0[Species_ID]}" to store all objects in one directory or
@@ -16,7 +16,7 @@ paths:
 references:
   all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/"
 metadata:
-  raw:
+  table:
   # should have same format as or be based on OTP export sheet
    - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv"
   identifiers: 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 9bc9bca..de65a9a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -6,7 +6,7 @@ from scripts.samples import Samples
 
 samples = Samples(config)
 
-METADATA_PATH = config["metadata"]["raw"]
+METADATA_PATH = config["metadata"]["table"]
 OUTPUT_BASE_PATH = config["paths"]["output_dir"]
 
 REFERENCES = config["references"]
diff --git a/workflow/envs/snakemake_cellranger.yaml b/workflow/envs/snakemake_cellranger.yaml
deleted file mode 100644
index d44ce8f..0000000
--- a/workflow/envs/snakemake_cellranger.yaml
+++ /dev/null
@@ -1,181 +0,0 @@
-# this env is required, please install and activate for running snakemake
-name: snakemake-cellranger
-channels:
-- conda-forge
-- bioconda
-- defaults
-dependencies:
-- libsqlite=3.39.4=h753d276_0
-- toposort=1.7=pyhd8ed1ab_0
-- importlib-metadata=5.0.0=pyha770c72_1
-- jinja2=3.1.2=pyhd8ed1ab_1
-- exceptiongroup=1.0.1=pyhd8ed1ab_0
-- veracitools=0.1.3=py_0
-- google-cloud-core=2.3.2=pyhd8ed1ab_0
-- coincbc=2.10.8=0_metapackage
-- oauth2client=4.1.3=py_0
-- ubiquerg=0.6.2=pyhd8ed1ab_0
-- google-auth=2.14.0=pyh1a96a4e_0
-- python-dateutil=2.8.2=pyhd8ed1ab_0
-- liblapacke=3.9.0=16_linux64_openblas
-- micromamba=1.0.0=1
-- coin-or-clp=1.17.7=hc56784d_2
-- aioeasywebdav=2.4.0=pyha770c72_0
-- libgomp=12.2.0=h65d4601_19
-- ply=3.11=py_1
-- importlib_resources=5.10.0=pyhd8ed1ab_0
-- libgfortran5=12.2.0=h337968e_19
-- datrie=0.8.2=py310h5764c6d_6
-- pip=22.3.1=pyhd8ed1ab_0
-- packaging=21.3=pyhd8ed1ab_0
-- dataclasses=0.8=pyhc8e2a94_3
-- pycparser=2.21=pyhd8ed1ab_0
-- configargparse=1.5.3=pyhd8ed1ab_0
-- urllib3=1.26.11=pyhd8ed1ab_0
-- colorama=0.4.6=pyhd8ed1ab_0
-- yarl=1.8.1=py310h5764c6d_0
-- psutil=5.9.4=py310h5764c6d_0
-- plac=1.3.5=pyhd8ed1ab_0
-- certifi=2022.9.24=pyhd8ed1ab_0
-- markupsafe=2.1.1=py310h5764c6d_2
-- toolz=0.12.0=pyhd8ed1ab_0
-- cachetools=5.2.0=pyhd8ed1ab_0
-- google-crc32c=1.1.2=py310he8fe98e_4
-- amply=0.1.5=pyhd8ed1ab_0
-- reretry=0.11.1=pyhd8ed1ab_0
-- c-ares=1.18.1=h7f98852_0
-- libsodium=1.0.18=h36c2ea0_1
-- peppy=0.35.2=pyhd8ed1ab_0
-- snakemake=7.18.1=hdfd78af_0
-- pytz=2022.6=pyhd8ed1ab_0
-- pytest=7.2.0=pyhd8ed1ab_2
-- libcblas=3.9.0=16_linux64_openblas
-- httplib2=0.21.0=pyhd8ed1ab_0
-- yte=1.5.1=py310hff52083_1
-- pyrsistent=0.19.2=py310h5764c6d_0
-- libgrpc=1.49.1=h30feacc_1
-- pulp=2.7.0=py310hff52083_0
-- attrs=22.1.0=pyh71513ae_1
-- pandas=1.5.1=py310h769672d_1
-- multidict=6.0.2=py310h5764c6d_2
-- connection_pool=0.0.3=pyhd3deb0d_0
-- _libgcc_mutex=0.1=conda_forge
-- google-api-core=2.10.2=pyhd8ed1ab_0
-- smmap=3.0.5=pyh44b312d_0
-- pygments=2.13.0=pyhd8ed1ab_0
-- wheel=0.38.3=pyhd8ed1ab_0
-- docutils=0.19=py310hff52083_1
-- ftputil=5.0.4=pyhd8ed1ab_0
-- conda=22.9.0=py310hff52083_2
-- gitdb=4.0.9=pyhd8ed1ab_0
-- aiohttp=3.8.3=py310h5764c6d_1
-- libgfortran-ng=12.2.0=h69a702a_19
-- stopit=1.1.2=py_0
-- defusedxml=0.7.1=pyhd8ed1ab_0
-- liblapack=3.9.0=16_linux64_openblas
-- backports=1.0=py_2
-- numpy=1.23.4=py310h53a5b5f_1
-- iniconfig=1.1.1=pyh9f0ad1d_0
-- snakemake-minimal=7.18.1=pyhdfd78af_0
-- coin-or-cbc=2.10.8=h3786ebc_0
-- tzdata=2022f=h191b570_0
-- readline=8.1.2=h0f457ee_0
-- frozenlist=1.3.3=py310h5764c6d_0
-- filelock=3.8.0=pyhd8ed1ab_0
-- ruamel_yaml=0.15.80=py310h5764c6d_1008
-- pycosat=0.6.4=py310h5764c6d_1
-- logmuse=0.2.6=pyh8c360ce_0
-- boto3=1.26.5=pyhd8ed1ab_0
-- pyparsing=3.0.9=pyhd8ed1ab_0
-- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
-- pyu2f=0.1.5=pyhd8ed1ab_0
-- protobuf=4.21.9=py310hd8f1fbe_0
-- conda-package-handling=1.9.0=py310h5764c6d_1
-- pysftp=0.2.9=py_1
-- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
-- python-fastjsonschema=2.16.2=pyhd8ed1ab_0
-- ld_impl_linux-64=2.39=hc81fddc_0
-- ncurses=6.3=h27087fc_1
-- googleapis-common-protos=1.56.4=py310hff52083_1
-- google-cloud-storage=2.6.0=pyh1a96a4e_0
-- slacker=0.14.0=py_0
-- libffi=3.4.2=h7f98852_5
-- _openmp_mutex=4.5=2_gnu
-- libzlib=1.2.13=h166bdaf_4
-- coin-or-utils=2.11.6=h202d8b1_2
-- libabseil=20220623.0=cxx17_h48a1fff_5
-- prettytable=3.4.1=pyhd8ed1ab_0
-- cffi=1.15.1=py310h255011f_2
-- typing-extensions=4.4.0=hd8ed1ab_0
-- pyyaml=6.0=py310h5764c6d_5
-- dropbox=11.35.0=pyhd8ed1ab_0
-- python_abi=3.10=2_cp310
-- nbformat=5.7.0=pyhd8ed1ab_0
-- libnsl=2.0.0=h7f98852_0
-- stone=3.3.1=pyhd8ed1ab_0
-- libblas=3.9.0=16_linux64_openblas
-- cryptography=38.0.3=py310h600f1e7_0
-- google-auth-httplib2=0.1.0=pyhd8ed1ab_1
-- tk=8.6.12=h27826a3_0
-- libopenblas=0.3.21=pthreads_h78a6416_3
-- pyasn1-modules=0.2.7=py_0
-- jsonschema=4.17.0=pyhd8ed1ab_0
-- coin-or-cgl=0.60.6=h6f57e76_2
-- dpath=2.0.6=py310hff52083_2
-- google-api-python-client=2.65.0=pyhd8ed1ab_0
-- setuptools-scm=7.0.5=pyhd8ed1ab_1
-- rsa=4.9=pyhd8ed1ab_0
-- pyasn1=0.4.8=py_0
-- wcwidth=0.2.5=pyh9f0ad1d_2
-- tqdm=4.64.1=pyhd8ed1ab_0
-- traitlets=5.5.0=pyhd8ed1ab_0
-- wrapt=1.14.1=py310h5764c6d_1
-- zipp=3.10.0=pyhd8ed1ab_0
-- botocore=1.29.5=pyhd8ed1ab_0
-- idna=3.4=pyhd8ed1ab_0
-- google-resumable-media=2.4.0=pyhd8ed1ab_0
-- bcrypt=3.2.2=py310h5764c6d_1
-- attmap=0.13.2=pyhd8ed1ab_0
-- requests=2.28.1=pyhd8ed1ab_1
-- xz=5.2.6=h166bdaf_0
-- grpcio=1.49.1=py310hc32fa93_1
-- libprotobuf=3.21.9=h6239696_0
-- gitpython=3.1.29=pyhd8ed1ab_0
-- s3transfer=0.6.0=pyhd8ed1ab_0
-- yaml=0.2.5=h7f98852_2
-- ratelimiter=1.2.0=pyhd8ed1ab_1003
-- bzip2=1.0.8=h7f98852_4
-- ca-certificates=2022.9.24=ha878542_0
-- uritemplate=4.1.1=pyhd8ed1ab_0
-- future=0.18.2=pyhd8ed1ab_6
-- jupyter_core=4.11.2=py310hff52083_0
-- pluggy=1.0.0=pyhd8ed1ab_5
-- brotlipy=0.7.0=py310h5764c6d_1005
-- jmespath=1.0.1=pyhd8ed1ab_0
-- setuptools=65.5.1=pyhd8ed1ab_0
-- libcrc32c=1.1.2=h9c3ff4c_0
-- libstdcxx-ng=12.2.0=h46fd767_19
-- commonmark=0.9.1=py_0
-- zlib=1.2.13=h166bdaf_4
-- tabulate=0.9.0=pyhd8ed1ab_1
-- re2=2022.06.01=h27087fc_0
-- appdirs=1.4.4=pyh9f0ad1d_0
-- aiosignal=1.3.1=pyhd8ed1ab_0
-- paramiko=2.12.0=pyhd8ed1ab_0
-- filechunkio=1.8=py_2
-- charset-normalizer=2.1.1=pyhd8ed1ab_0
-- python-irodsclient=1.1.5=pyhd8ed1ab_0
-- rich=12.6.0=pyhd8ed1ab_0
-- async-timeout=4.0.2=pyhd8ed1ab_0
-- six=1.16.0=pyh6c4a22f_0
-- coin-or-osi=0.108.7=h2720bb7_2
-- pyopenssl=22.1.0=pyhd8ed1ab_0
-- libuuid=2.32.1=h7f98852_1000
-- pysocks=1.7.1=pyha2e5f31_6
-- openssl=3.0.7=h166bdaf_0
-- typing_extensions=4.4.0=pyha770c72_0
-- python=3.10.6=ha86cf86_0_cpython
-- pynacl=1.5.0=py310h5764c6d_2
-- smart_open=6.2.0=pyha770c72_0
-- tomli=2.0.1=pyhd8ed1ab_0
-- libgcc-ng=12.2.0=h65d4601_19
\ No newline at end of file
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index c4565f7..4be32d8 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -4,6 +4,9 @@ library(tidyverse, quietly = TRUE)
 library(DropletUtils, quietly = TRUE) 
 set.seed(1)
 
+individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
+IDENTIFIERS <- snakemake@params[["identifiers"]] 
+
 # construct SCE object from raw cellranger output
 print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix"))
 sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], 
@@ -20,25 +23,21 @@ metadata <- read.csv(file = snakemake@params[["metadata"]],
                      as.is=TRUE, 
                      colClasses = "character")
 
-individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample 
-IDENTIFIERS <- snakemake@params[["identifiers"]] 
-
 # if necessary, concatenate identifiers again to obtain all possible wildcards 
-metadata_curr <- metadata
-if(! "individual" %in% colnames(metadata_curr)){
-  metadata_curr <- metadata_curr %>%
+if(! "individual" %in% colnames(metadata)){
+  metadata <- metadata %>%
     unite(col = "individual", all_of(IDENTIFIERS), sep = "_", remove = FALSE)
   print('establishing "individual" column, construct_sce_objects.R')
 }
 
 # subset data as specified by wildcard and single_cell_object_metadata_fields
-metadata_curr <- metadata_curr[which(metadata_curr$individual == individual_curr),]
+metadata <- metadata[which(metadata$individual == individual_curr),]
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
-metadata_curr <- metadata_curr[,colnames(metadata_curr) %in% cols_add]
+metadata <- metadata[,colnames(metadata) %in% cols_add]
 
 # add cell-level metadata to each barcode in the SCE object
-for(i in colnames(metadata_curr)){
-  colData(sce)[i] <- rep(metadata_curr[1,i], ncol(sce))
+for(i in colnames(metadata)){
+  colData(sce)[i] <- rep(metadata[1,i], ncol(sce))
 }
 
 print(sce)

From 47c9b94db7075df7c988501064d91521d2e70ad1 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Mon, 27 Mar 2023 15:46:34 +0200
Subject: [PATCH 29/32] readability improvements

---
 README.md                                |   8 +-
 snakemake_cellranger.yaml                | 181 +++++++++++++++++++++++
 workflow/Snakefile                       |   4 +-
 workflow/scripts/construct_sce_objects.R |   2 +-
 4 files changed, 190 insertions(+), 5 deletions(-)
 create mode 100644 snakemake_cellranger.yaml

diff --git a/README.md b/README.md
index 4341f2a..3a3289c 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ Minimal changes needed are:
 
 ## How to run 
 
-Use snakemake_cellranger.yaml to build an environment with all required packages.
+Use snakemake_cellranger.yaml to create an environment with all required packages.
 
 ```bash
 micromamba create -f snakemake_cellranger.yaml
@@ -61,6 +61,12 @@ or
 conda env create -f snakemake_cellranger.yaml
 ```
 
+Set channel priority to strict.
+
+```bash
+conda config --set channel_priority strict
+```
+
 You may call the pipeline as follows in the directory where you cloned it. 
 
 ```bash
diff --git a/snakemake_cellranger.yaml b/snakemake_cellranger.yaml
new file mode 100644
index 0000000..d44ce8f
--- /dev/null
+++ b/snakemake_cellranger.yaml
@@ -0,0 +1,181 @@
+# this env is required, please install and activate for running snakemake
+name: snakemake-cellranger
+channels:
+- conda-forge
+- bioconda
+- defaults
+dependencies:
+- libsqlite=3.39.4=h753d276_0
+- toposort=1.7=pyhd8ed1ab_0
+- importlib-metadata=5.0.0=pyha770c72_1
+- jinja2=3.1.2=pyhd8ed1ab_1
+- exceptiongroup=1.0.1=pyhd8ed1ab_0
+- veracitools=0.1.3=py_0
+- google-cloud-core=2.3.2=pyhd8ed1ab_0
+- coincbc=2.10.8=0_metapackage
+- oauth2client=4.1.3=py_0
+- ubiquerg=0.6.2=pyhd8ed1ab_0
+- google-auth=2.14.0=pyh1a96a4e_0
+- python-dateutil=2.8.2=pyhd8ed1ab_0
+- liblapacke=3.9.0=16_linux64_openblas
+- micromamba=1.0.0=1
+- coin-or-clp=1.17.7=hc56784d_2
+- aioeasywebdav=2.4.0=pyha770c72_0
+- libgomp=12.2.0=h65d4601_19
+- ply=3.11=py_1
+- importlib_resources=5.10.0=pyhd8ed1ab_0
+- libgfortran5=12.2.0=h337968e_19
+- datrie=0.8.2=py310h5764c6d_6
+- pip=22.3.1=pyhd8ed1ab_0
+- packaging=21.3=pyhd8ed1ab_0
+- dataclasses=0.8=pyhc8e2a94_3
+- pycparser=2.21=pyhd8ed1ab_0
+- configargparse=1.5.3=pyhd8ed1ab_0
+- urllib3=1.26.11=pyhd8ed1ab_0
+- colorama=0.4.6=pyhd8ed1ab_0
+- yarl=1.8.1=py310h5764c6d_0
+- psutil=5.9.4=py310h5764c6d_0
+- plac=1.3.5=pyhd8ed1ab_0
+- certifi=2022.9.24=pyhd8ed1ab_0
+- markupsafe=2.1.1=py310h5764c6d_2
+- toolz=0.12.0=pyhd8ed1ab_0
+- cachetools=5.2.0=pyhd8ed1ab_0
+- google-crc32c=1.1.2=py310he8fe98e_4
+- amply=0.1.5=pyhd8ed1ab_0
+- reretry=0.11.1=pyhd8ed1ab_0
+- c-ares=1.18.1=h7f98852_0
+- libsodium=1.0.18=h36c2ea0_1
+- peppy=0.35.2=pyhd8ed1ab_0
+- snakemake=7.18.1=hdfd78af_0
+- pytz=2022.6=pyhd8ed1ab_0
+- pytest=7.2.0=pyhd8ed1ab_2
+- libcblas=3.9.0=16_linux64_openblas
+- httplib2=0.21.0=pyhd8ed1ab_0
+- yte=1.5.1=py310hff52083_1
+- pyrsistent=0.19.2=py310h5764c6d_0
+- libgrpc=1.49.1=h30feacc_1
+- pulp=2.7.0=py310hff52083_0
+- attrs=22.1.0=pyh71513ae_1
+- pandas=1.5.1=py310h769672d_1
+- multidict=6.0.2=py310h5764c6d_2
+- connection_pool=0.0.3=pyhd3deb0d_0
+- _libgcc_mutex=0.1=conda_forge
+- google-api-core=2.10.2=pyhd8ed1ab_0
+- smmap=3.0.5=pyh44b312d_0
+- pygments=2.13.0=pyhd8ed1ab_0
+- wheel=0.38.3=pyhd8ed1ab_0
+- docutils=0.19=py310hff52083_1
+- ftputil=5.0.4=pyhd8ed1ab_0
+- conda=22.9.0=py310hff52083_2
+- gitdb=4.0.9=pyhd8ed1ab_0
+- aiohttp=3.8.3=py310h5764c6d_1
+- libgfortran-ng=12.2.0=h69a702a_19
+- stopit=1.1.2=py_0
+- defusedxml=0.7.1=pyhd8ed1ab_0
+- liblapack=3.9.0=16_linux64_openblas
+- backports=1.0=py_2
+- numpy=1.23.4=py310h53a5b5f_1
+- iniconfig=1.1.1=pyh9f0ad1d_0
+- snakemake-minimal=7.18.1=pyhdfd78af_0
+- coin-or-cbc=2.10.8=h3786ebc_0
+- tzdata=2022f=h191b570_0
+- readline=8.1.2=h0f457ee_0
+- frozenlist=1.3.3=py310h5764c6d_0
+- filelock=3.8.0=pyhd8ed1ab_0
+- ruamel_yaml=0.15.80=py310h5764c6d_1008
+- pycosat=0.6.4=py310h5764c6d_1
+- logmuse=0.2.6=pyh8c360ce_0
+- boto3=1.26.5=pyhd8ed1ab_0
+- pyparsing=3.0.9=pyhd8ed1ab_0
+- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
+- pyu2f=0.1.5=pyhd8ed1ab_0
+- protobuf=4.21.9=py310hd8f1fbe_0
+- conda-package-handling=1.9.0=py310h5764c6d_1
+- pysftp=0.2.9=py_1
+- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
+- python-fastjsonschema=2.16.2=pyhd8ed1ab_0
+- ld_impl_linux-64=2.39=hc81fddc_0
+- ncurses=6.3=h27087fc_1
+- googleapis-common-protos=1.56.4=py310hff52083_1
+- google-cloud-storage=2.6.0=pyh1a96a4e_0
+- slacker=0.14.0=py_0
+- libffi=3.4.2=h7f98852_5
+- _openmp_mutex=4.5=2_gnu
+- libzlib=1.2.13=h166bdaf_4
+- coin-or-utils=2.11.6=h202d8b1_2
+- libabseil=20220623.0=cxx17_h48a1fff_5
+- prettytable=3.4.1=pyhd8ed1ab_0
+- cffi=1.15.1=py310h255011f_2
+- typing-extensions=4.4.0=hd8ed1ab_0
+- pyyaml=6.0=py310h5764c6d_5
+- dropbox=11.35.0=pyhd8ed1ab_0
+- python_abi=3.10=2_cp310
+- nbformat=5.7.0=pyhd8ed1ab_0
+- libnsl=2.0.0=h7f98852_0
+- stone=3.3.1=pyhd8ed1ab_0
+- libblas=3.9.0=16_linux64_openblas
+- cryptography=38.0.3=py310h600f1e7_0
+- google-auth-httplib2=0.1.0=pyhd8ed1ab_1
+- tk=8.6.12=h27826a3_0
+- libopenblas=0.3.21=pthreads_h78a6416_3
+- pyasn1-modules=0.2.7=py_0
+- jsonschema=4.17.0=pyhd8ed1ab_0
+- coin-or-cgl=0.60.6=h6f57e76_2
+- dpath=2.0.6=py310hff52083_2
+- google-api-python-client=2.65.0=pyhd8ed1ab_0
+- setuptools-scm=7.0.5=pyhd8ed1ab_1
+- rsa=4.9=pyhd8ed1ab_0
+- pyasn1=0.4.8=py_0
+- wcwidth=0.2.5=pyh9f0ad1d_2
+- tqdm=4.64.1=pyhd8ed1ab_0
+- traitlets=5.5.0=pyhd8ed1ab_0
+- wrapt=1.14.1=py310h5764c6d_1
+- zipp=3.10.0=pyhd8ed1ab_0
+- botocore=1.29.5=pyhd8ed1ab_0
+- idna=3.4=pyhd8ed1ab_0
+- google-resumable-media=2.4.0=pyhd8ed1ab_0
+- bcrypt=3.2.2=py310h5764c6d_1
+- attmap=0.13.2=pyhd8ed1ab_0
+- requests=2.28.1=pyhd8ed1ab_1
+- xz=5.2.6=h166bdaf_0
+- grpcio=1.49.1=py310hc32fa93_1
+- libprotobuf=3.21.9=h6239696_0
+- gitpython=3.1.29=pyhd8ed1ab_0
+- s3transfer=0.6.0=pyhd8ed1ab_0
+- yaml=0.2.5=h7f98852_2
+- ratelimiter=1.2.0=pyhd8ed1ab_1003
+- bzip2=1.0.8=h7f98852_4
+- ca-certificates=2022.9.24=ha878542_0
+- uritemplate=4.1.1=pyhd8ed1ab_0
+- future=0.18.2=pyhd8ed1ab_6
+- jupyter_core=4.11.2=py310hff52083_0
+- pluggy=1.0.0=pyhd8ed1ab_5
+- brotlipy=0.7.0=py310h5764c6d_1005
+- jmespath=1.0.1=pyhd8ed1ab_0
+- setuptools=65.5.1=pyhd8ed1ab_0
+- libcrc32c=1.1.2=h9c3ff4c_0
+- libstdcxx-ng=12.2.0=h46fd767_19
+- commonmark=0.9.1=py_0
+- zlib=1.2.13=h166bdaf_4
+- tabulate=0.9.0=pyhd8ed1ab_1
+- re2=2022.06.01=h27087fc_0
+- appdirs=1.4.4=pyh9f0ad1d_0
+- aiosignal=1.3.1=pyhd8ed1ab_0
+- paramiko=2.12.0=pyhd8ed1ab_0
+- filechunkio=1.8=py_2
+- charset-normalizer=2.1.1=pyhd8ed1ab_0
+- python-irodsclient=1.1.5=pyhd8ed1ab_0
+- rich=12.6.0=pyhd8ed1ab_0
+- async-timeout=4.0.2=pyhd8ed1ab_0
+- six=1.16.0=pyh6c4a22f_0
+- coin-or-osi=0.108.7=h2720bb7_2
+- pyopenssl=22.1.0=pyhd8ed1ab_0
+- libuuid=2.32.1=h7f98852_1000
+- pysocks=1.7.1=pyha2e5f31_6
+- openssl=3.0.7=h166bdaf_0
+- typing_extensions=4.4.0=pyha770c72_0
+- python=3.10.6=ha86cf86_0_cpython
+- pynacl=1.5.0=py310h5764c6d_2
+- smart_open=6.2.0=pyha770c72_0
+- tomli=2.0.1=pyhd8ed1ab_0
+- libgcc-ng=12.2.0=h65d4601_19
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index de65a9a..8c705fb 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -97,9 +97,6 @@ rule cellranger_count:
 
 rule construct_sce_objects:
     """construct SCE objects from counts matrices
-
-    set channel priority to strict with << conda config --set channel_priority strict >>
-    use << --conda-frontend conda >> in snakemake command
     
     Initially, building the environment might take some time.
     
@@ -113,6 +110,7 @@ rule construct_sce_objects:
         identifiers = IDENTIFIERS,
         metadata = METADATA_PATH,
         single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"]
+    threads: 2
     conda: 
         "envs/construct_sce_objects.yaml" 
     script:
diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R
index 4be32d8..3e21d8f 100644
--- a/workflow/scripts/construct_sce_objects.R
+++ b/workflow/scripts/construct_sce_objects.R
@@ -31,7 +31,7 @@ if(! "individual" %in% colnames(metadata)){
 }
 
 # subset data as specified by wildcard and single_cell_object_metadata_fields
-metadata <- metadata[which(metadata$individual == individual_curr),]
+metadata <- metadata |> filter(individual == individual_curr)
 cols_add <- snakemake@params[["single_cell_object_metadata_fields"]]
 metadata <- metadata[,colnames(metadata) %in% cols_add]
 

From c5033c6d6f461077a3d1a1ac18e33dc4562a1374 Mon Sep 17 00:00:00 2001
From: leawlb <76486615+leawlb@users.noreply.github.com>
Date: Mon, 27 Mar 2023 15:54:25 +0200
Subject: [PATCH 30/32] Update workflow/scripts/samples.py

Co-authored-by: Fritjof Lammers <25619971+mobilegenome@users.noreply.github.com>
---
 workflow/scripts/samples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index a447908..1f85517 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -41,7 +41,7 @@ def __init__(self, config):
    
         metadata_full = self.get_cellranger_filename(metadata_full)
 
-        self.metadata = self.select_columns(metadata_full, identifiers = IDENTIFIERS)
+        self.metadata = self.select_columns(metadata_full, custom_columns = IDENTIFIERS)
 
     def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 

From b2d5b8c1d770f832d9cbbf1acf68636ebf30060f Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Tue, 28 Mar 2023 09:50:19 +0200
Subject: [PATCH 31/32] adjusted select_columns function plus minor corrections

---
 README.md                                  |  2 +-
 config/config-interspecies-bonemarrow.yaml |  2 +-
 workflow/scripts/samples.py                | 60 ++++++++++++----------
 3 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 3a3289c..7580ac7 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ conda config --set channel_priority strict
 You may call the pipeline as follows in the directory where you cloned it. 
 
 ```bash
-snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda  --use-envmodules --conda-frontend conda
+snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules --conda-frontend conda
 ```
 
  - `--cluster` may change depending on the computational footprint of your analyses
diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml
index 3eedb8e..4a571cf 100644
--- a/config/config-interspecies-bonemarrow.yaml
+++ b/config/config-interspecies-bonemarrow.yaml
@@ -1,7 +1,7 @@
 paths:
   # directory to store output data = cellranger files and SingleCellExperiment objects
   # paths to input data = fastq files are stored in metadata sheet (OTP export)
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test2"
   target_templates:
     # paths for each type of output file
       # remove "/{0[Species_ID]}" to store all objects in one directory or
diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py
index 1f85517..a41a331 100644
--- a/workflow/scripts/samples.py
+++ b/workflow/scripts/samples.py
@@ -2,46 +2,46 @@
 
 from snakemake.io import Wildcards
 
-
 class Samples:
     """
     Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame
     that provides metadata for the workflow.
     """
-    # columns to select
-    columns = ["PID",
-               "Sample Type",
-               "Species Scientific Name",
-               "STRAIN",
-               "FastQ Path",
-               "FastQC Path",
-               "Run ID",
-               "ILSE_NO",
-               "TISSUE",
-               "BIRTH",
-               "DATE_OF_BIRTH",
-               "DATE_OF_DEATH",
-               "LANE_NO",
-               "READ",
-               "CELLRANGER_FASTQ_PATH",
-               "individual"]  # include 'individual' col to be generated below
                
     def __init__(self, config):
         IDENTIFIERS = config["metadata"]["identifiers"]
-        metadata_files = config["metadata"]["raw"]
+        metadata_files = config["metadata"]["table"]
 
         self.output_base_dir = config["paths"]["output_dir"]
         self.target_templates = config["paths"]["target_templates"]
+        
+        # columns to select for subsampling 
+        columns = ["PID",
+                   "Sample Type",
+                   "Species Scientific Name",
+                   "STRAIN",
+                   "FastQ Path",
+                   "FastQC Path",
+                   "Run ID",
+                   "ILSE_NO",
+                   "TISSUE",
+                   "BIRTH",
+                   "DATE_OF_BIRTH",
+                   "DATE_OF_DEATH",
+                   "LANE_NO",
+                   "READ",
+                   "CELLRANGER_FASTQ_PATH",
+                   "individual"]  # include 'individual' col to be generated below
 
         metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True)
 
-        # generate 'individual' column containing all concatenated identifiers
+        # if required, generate 'individual' column containing all concatenated identifiers
         if not "individual" in metadata_full.columns: 
             metadata_full['individual'] = metadata_full[IDENTIFIERS].apply('_'.join, axis=1)
    
         metadata_full = self.get_cellranger_filename(metadata_full)
 
-        self.metadata = self.select_columns(metadata_full, custom_columns = IDENTIFIERS)
+        self.metadata = self.select_columns(metadata_full, custom_columns = columns + IDENTIFIERS)
 
     def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 
@@ -69,14 +69,18 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame:
 
     def select_columns(self,
                        df: pd.DataFrame,
-                       columns: list = None,
-                       identifiers: str = None):
+                       custom_columns: list = None):
         """Select/Subset columns from DataFrame to reduce
-        DataFrame dimensions. """
-
-        if not columns:
-            columns = self.columns + identifiers
-        return df[columns]
+        DataFrame dimensions. A list of column names can be provided by `custom_columns` """
+        
+        if custom_columns:
+          print(custom_columns)
+          df_subset = df[custom_columns] 
+        else:
+          print(custom_columns)
+          df_subset = df[columns]
+        
+        return df_subset
             
     @staticmethod
     def filter_by_wildcards(

From cdcda74c65592cd3ea79300724136a91c21ee909 Mon Sep 17 00:00:00 2001
From: leawlb <lea.woelbert@gmail.com>
Date: Thu, 30 Mar 2023 09:20:23 +0200
Subject: [PATCH 32/32] minor corrections

---
 config/config-interspecies-bonemarrow.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml
index 4a571cf..3eedb8e 100644
--- a/config/config-interspecies-bonemarrow.yaml
+++ b/config/config-interspecies-bonemarrow.yaml
@@ -1,7 +1,7 @@
 paths:
   # directory to store output data = cellranger files and SingleCellExperiment objects
   # paths to input data = fastq files are stored in metadata sheet (OTP export)
-  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/test2"
+  output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data"
   target_templates:
     # paths for each type of output file
       # remove "/{0[Species_ID]}" to store all objects in one directory or