diff --git a/README.md b/README.md index ff6e88e..7580ac7 100644 --- a/README.md +++ b/README.md @@ -49,12 +49,29 @@ Minimal changes needed are: ## How to run +Use snakemake_cellranger.yaml to create an environment with all required packages. + +```bash +micromamba create -f snakemake_cellranger.yaml +``` + +or + +```bash +conda env create -f snakemake_cellranger.yaml +``` + +Set channel priority to strict. + +```bash +conda config --set channel_priority strict +``` + You may call the pipeline as follows in the directory where you cloned it. ```bash -snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules +snakemake --cluster "bsub -n16 -q verylong -R rusage[mem=200GB]" -p -j4 -c42 --configfile config/config-cluster.yaml --use-conda --use-envmodules --conda-frontend conda ``` - `--cluster` may change depending on the computational footprint of your analyses - `--configfile` should point to your personal configuration - diff --git a/config/config-example.yaml b/config/config-example.yaml index 2040a43..0e12dce 100644 --- a/config/config-example.yaml +++ b/config/config-example.yaml @@ -22,16 +22,15 @@ metadata: - "" # Define the column names used to define a sample identifiers: - - SAMPLE_NAME + - PID - Sample_Type - - Age - - fraction # Define all columns from the metadata spreadsheet that # will be included in the SingleCellExperiment / Seurat # objects single_cell_object_metadata_fields: - - SAMPLE_NAME - - Age + - PID + - Sample_Type + - individual # Enable / Disable rules and specifiy rule-specific parameters rules: cellranger_count: diff --git a/config/config-interspecies-bonemarrow.yaml b/config/config-interspecies-bonemarrow.yaml new file mode 100644 index 0000000..3eedb8e --- /dev/null +++ b/config/config-interspecies-bonemarrow.yaml @@ -0,0 +1,58 @@ +paths: + # directory to store output data = cellranger files and SingleCellExperiment objects + # paths to input data = fastq files are stored in metadata sheet (OTP export) + output_dir: "/omics/odcf/analysis/OE0538_projects/DO-0008/data" + target_templates: + # paths for each type of output file + # remove "/{0[Species_ID]}" to store all objects in one directory or + # replace "Species_ID" with other colname as suitable wildcard from metadata sheet (e.g. Sample_Type, ...) to create subfolders + # remove or replace all instances of Species_ID in Snakefile and samples.py + linked_files: "cellranger/linked_files/{0[Species_ID]}/{0[individual]}" + cellranger_count: "cellranger/cellranger_count/{0[Species_ID]}/{0[individual]}/outs" + construct_sce_objects: "sce_objects/01_cellranger_output/{0[Species_ID]}/sce_{0[individual]}-01" + # define specific target files + target_files: + samples_sheet: "metadata.csv" +references: + all_masked: "/omics/groups/OE0538/internal/shared_data/CellRangerReferences/GRCm38_masked_allStrains/" +metadata: + table: + # should have same format as or be based on OTP export sheet + - "/omics/odcf/analysis/OE0538_projects/DO-0008/metadata/OE0538_DO-0008_metadata_combined.csv" + identifiers: + # define one or multiple column name(s) that uniquely define a sample/object + # multiple cols are concatenated into one sample-specific name + # in the format of {Species_ID}_{Age_ID}_{Fraction_ID}_{Sample_NR} + - Species_ID # columns I manually added to metadata sheet + - Age_ID + - Fraction_ID + - Sample_NR + #- PID # columns included in OTP metadata sheet + #- Sample_Type + single_cell_object_metadata_fields: + # Define all columns from the metadata spreadsheet that + # will be included in the SingleCellExperiment / Seurat objects + - Object_ID # columns I manually added to metadata sheet + - Mouse_ID + - Species_ID + - Age + - Age_ID + - Age_weeks + - Fraction + - Fraction_ID + - Antibody_combination + - Sample_NR + - Extrarun + - Batch_exp_day + - Batch_sequencing + - Date_collected + - Keep_sample + - individual + - PID +# Enable / Disable rules and specifiy rule-specific parameters +rules: + cellranger_count: + extra: "" # set additional arguments for cellranger count + allele_specific: False + wasp_filter_reads: False + realign_bam: False \ No newline at end of file diff --git a/snakemake_cellranger.yaml b/snakemake_cellranger.yaml new file mode 100644 index 0000000..d44ce8f --- /dev/null +++ b/snakemake_cellranger.yaml @@ -0,0 +1,181 @@ +# this env is required, please install and activate for running snakemake +name: snakemake-cellranger +channels: +- conda-forge +- bioconda +- defaults +dependencies: +- libsqlite=3.39.4=h753d276_0 +- toposort=1.7=pyhd8ed1ab_0 +- importlib-metadata=5.0.0=pyha770c72_1 +- jinja2=3.1.2=pyhd8ed1ab_1 +- exceptiongroup=1.0.1=pyhd8ed1ab_0 +- veracitools=0.1.3=py_0 +- google-cloud-core=2.3.2=pyhd8ed1ab_0 +- coincbc=2.10.8=0_metapackage +- oauth2client=4.1.3=py_0 +- ubiquerg=0.6.2=pyhd8ed1ab_0 +- google-auth=2.14.0=pyh1a96a4e_0 +- python-dateutil=2.8.2=pyhd8ed1ab_0 +- liblapacke=3.9.0=16_linux64_openblas +- micromamba=1.0.0=1 +- coin-or-clp=1.17.7=hc56784d_2 +- aioeasywebdav=2.4.0=pyha770c72_0 +- libgomp=12.2.0=h65d4601_19 +- ply=3.11=py_1 +- importlib_resources=5.10.0=pyhd8ed1ab_0 +- libgfortran5=12.2.0=h337968e_19 +- datrie=0.8.2=py310h5764c6d_6 +- pip=22.3.1=pyhd8ed1ab_0 +- packaging=21.3=pyhd8ed1ab_0 +- dataclasses=0.8=pyhc8e2a94_3 +- pycparser=2.21=pyhd8ed1ab_0 +- configargparse=1.5.3=pyhd8ed1ab_0 +- urllib3=1.26.11=pyhd8ed1ab_0 +- colorama=0.4.6=pyhd8ed1ab_0 +- yarl=1.8.1=py310h5764c6d_0 +- psutil=5.9.4=py310h5764c6d_0 +- plac=1.3.5=pyhd8ed1ab_0 +- certifi=2022.9.24=pyhd8ed1ab_0 +- markupsafe=2.1.1=py310h5764c6d_2 +- toolz=0.12.0=pyhd8ed1ab_0 +- cachetools=5.2.0=pyhd8ed1ab_0 +- google-crc32c=1.1.2=py310he8fe98e_4 +- amply=0.1.5=pyhd8ed1ab_0 +- reretry=0.11.1=pyhd8ed1ab_0 +- c-ares=1.18.1=h7f98852_0 +- libsodium=1.0.18=h36c2ea0_1 +- peppy=0.35.2=pyhd8ed1ab_0 +- snakemake=7.18.1=hdfd78af_0 +- pytz=2022.6=pyhd8ed1ab_0 +- pytest=7.2.0=pyhd8ed1ab_2 +- libcblas=3.9.0=16_linux64_openblas +- httplib2=0.21.0=pyhd8ed1ab_0 +- yte=1.5.1=py310hff52083_1 +- pyrsistent=0.19.2=py310h5764c6d_0 +- libgrpc=1.49.1=h30feacc_1 +- pulp=2.7.0=py310hff52083_0 +- attrs=22.1.0=pyh71513ae_1 +- pandas=1.5.1=py310h769672d_1 +- multidict=6.0.2=py310h5764c6d_2 +- connection_pool=0.0.3=pyhd3deb0d_0 +- _libgcc_mutex=0.1=conda_forge +- google-api-core=2.10.2=pyhd8ed1ab_0 +- smmap=3.0.5=pyh44b312d_0 +- pygments=2.13.0=pyhd8ed1ab_0 +- wheel=0.38.3=pyhd8ed1ab_0 +- docutils=0.19=py310hff52083_1 +- ftputil=5.0.4=pyhd8ed1ab_0 +- conda=22.9.0=py310hff52083_2 +- gitdb=4.0.9=pyhd8ed1ab_0 +- aiohttp=3.8.3=py310h5764c6d_1 +- libgfortran-ng=12.2.0=h69a702a_19 +- stopit=1.1.2=py_0 +- defusedxml=0.7.1=pyhd8ed1ab_0 +- liblapack=3.9.0=16_linux64_openblas +- backports=1.0=py_2 +- numpy=1.23.4=py310h53a5b5f_1 +- iniconfig=1.1.1=pyh9f0ad1d_0 +- snakemake-minimal=7.18.1=pyhdfd78af_0 +- coin-or-cbc=2.10.8=h3786ebc_0 +- tzdata=2022f=h191b570_0 +- readline=8.1.2=h0f457ee_0 +- frozenlist=1.3.3=py310h5764c6d_0 +- filelock=3.8.0=pyhd8ed1ab_0 +- ruamel_yaml=0.15.80=py310h5764c6d_1008 +- pycosat=0.6.4=py310h5764c6d_1 +- logmuse=0.2.6=pyh8c360ce_0 +- boto3=1.26.5=pyhd8ed1ab_0 +- pyparsing=3.0.9=pyhd8ed1ab_0 +- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 +- pyu2f=0.1.5=pyhd8ed1ab_0 +- protobuf=4.21.9=py310hd8f1fbe_0 +- conda-package-handling=1.9.0=py310h5764c6d_1 +- pysftp=0.2.9=py_1 +- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 +- python-fastjsonschema=2.16.2=pyhd8ed1ab_0 +- ld_impl_linux-64=2.39=hc81fddc_0 +- ncurses=6.3=h27087fc_1 +- googleapis-common-protos=1.56.4=py310hff52083_1 +- google-cloud-storage=2.6.0=pyh1a96a4e_0 +- slacker=0.14.0=py_0 +- libffi=3.4.2=h7f98852_5 +- _openmp_mutex=4.5=2_gnu +- libzlib=1.2.13=h166bdaf_4 +- coin-or-utils=2.11.6=h202d8b1_2 +- libabseil=20220623.0=cxx17_h48a1fff_5 +- prettytable=3.4.1=pyhd8ed1ab_0 +- cffi=1.15.1=py310h255011f_2 +- typing-extensions=4.4.0=hd8ed1ab_0 +- pyyaml=6.0=py310h5764c6d_5 +- dropbox=11.35.0=pyhd8ed1ab_0 +- python_abi=3.10=2_cp310 +- nbformat=5.7.0=pyhd8ed1ab_0 +- libnsl=2.0.0=h7f98852_0 +- stone=3.3.1=pyhd8ed1ab_0 +- libblas=3.9.0=16_linux64_openblas +- cryptography=38.0.3=py310h600f1e7_0 +- google-auth-httplib2=0.1.0=pyhd8ed1ab_1 +- tk=8.6.12=h27826a3_0 +- libopenblas=0.3.21=pthreads_h78a6416_3 +- pyasn1-modules=0.2.7=py_0 +- jsonschema=4.17.0=pyhd8ed1ab_0 +- coin-or-cgl=0.60.6=h6f57e76_2 +- dpath=2.0.6=py310hff52083_2 +- google-api-python-client=2.65.0=pyhd8ed1ab_0 +- setuptools-scm=7.0.5=pyhd8ed1ab_1 +- rsa=4.9=pyhd8ed1ab_0 +- pyasn1=0.4.8=py_0 +- wcwidth=0.2.5=pyh9f0ad1d_2 +- tqdm=4.64.1=pyhd8ed1ab_0 +- traitlets=5.5.0=pyhd8ed1ab_0 +- wrapt=1.14.1=py310h5764c6d_1 +- zipp=3.10.0=pyhd8ed1ab_0 +- botocore=1.29.5=pyhd8ed1ab_0 +- idna=3.4=pyhd8ed1ab_0 +- google-resumable-media=2.4.0=pyhd8ed1ab_0 +- bcrypt=3.2.2=py310h5764c6d_1 +- attmap=0.13.2=pyhd8ed1ab_0 +- requests=2.28.1=pyhd8ed1ab_1 +- xz=5.2.6=h166bdaf_0 +- grpcio=1.49.1=py310hc32fa93_1 +- libprotobuf=3.21.9=h6239696_0 +- gitpython=3.1.29=pyhd8ed1ab_0 +- s3transfer=0.6.0=pyhd8ed1ab_0 +- yaml=0.2.5=h7f98852_2 +- ratelimiter=1.2.0=pyhd8ed1ab_1003 +- bzip2=1.0.8=h7f98852_4 +- ca-certificates=2022.9.24=ha878542_0 +- uritemplate=4.1.1=pyhd8ed1ab_0 +- future=0.18.2=pyhd8ed1ab_6 +- jupyter_core=4.11.2=py310hff52083_0 +- pluggy=1.0.0=pyhd8ed1ab_5 +- brotlipy=0.7.0=py310h5764c6d_1005 +- jmespath=1.0.1=pyhd8ed1ab_0 +- setuptools=65.5.1=pyhd8ed1ab_0 +- libcrc32c=1.1.2=h9c3ff4c_0 +- libstdcxx-ng=12.2.0=h46fd767_19 +- commonmark=0.9.1=py_0 +- zlib=1.2.13=h166bdaf_4 +- tabulate=0.9.0=pyhd8ed1ab_1 +- re2=2022.06.01=h27087fc_0 +- appdirs=1.4.4=pyh9f0ad1d_0 +- aiosignal=1.3.1=pyhd8ed1ab_0 +- paramiko=2.12.0=pyhd8ed1ab_0 +- filechunkio=1.8=py_2 +- charset-normalizer=2.1.1=pyhd8ed1ab_0 +- python-irodsclient=1.1.5=pyhd8ed1ab_0 +- rich=12.6.0=pyhd8ed1ab_0 +- async-timeout=4.0.2=pyhd8ed1ab_0 +- six=1.16.0=pyh6c4a22f_0 +- coin-or-osi=0.108.7=h2720bb7_2 +- pyopenssl=22.1.0=pyhd8ed1ab_0 +- libuuid=2.32.1=h7f98852_1000 +- pysocks=1.7.1=pyha2e5f31_6 +- openssl=3.0.7=h166bdaf_0 +- typing_extensions=4.4.0=pyha770c72_0 +- python=3.10.6=ha86cf86_0_cpython +- pynacl=1.5.0=py310h5764c6d_2 +- smart_open=6.2.0=pyha770c72_0 +- tomli=2.0.1=pyhd8ed1ab_0 +- libgcc-ng=12.2.0=h65d4601_19 \ No newline at end of file diff --git a/utils/process_otp_metadata.py b/utils/process_otp_metadata.py new file mode 100644 index 0000000..10ca797 --- /dev/null +++ b/utils/process_otp_metadata.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def rename_date_of_birth(row: pd.Series): + """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into + a single 'DATE_OF_BIRTH' column. + """ + values = [row["BIRTH"], row["DATE_OF_BIRTH"]] + dates = [val for val in values if not pd.isna(val)] + if len(dates) == 1: + return dates[0] + elif len(dates) != 1: + return pd.NA diff --git a/workflow/Snakefile b/workflow/Snakefile index b035d0c..8c705fb 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,27 +6,24 @@ from scripts.samples import Samples samples = Samples(config) -REFERENCES = config["references"] +METADATA_PATH = config["metadata"]["table"] OUTPUT_BASE_PATH = config["paths"]["output_dir"] -def get_sample_string(identifiers): - return "_".join(["{" + label + "}" for label in identifiers]) - -SAMPLE_WILDCARDS = get_sample_string(config['metadata']['identifiers']) +REFERENCES = config["references"] +IDENTIFIERS = config["metadata"]["identifiers"] print(samples.metadata) localrules: link_files, write_samples_spreadsheet, all # local execution of non-demanding rules -wildcard_constraints: - sample_type="[A-z0-9_-]+", - individual="[A-z0-9_-]+" +wildcard_constraints: #should not contain ".", see rule cellranger_count + individual="[A-z0-9_-]+", + Species_ID="[A-z0-9_-]+" rule all: input: samples.targets - rule link_files: """Link files with filename pattern required for Cellranger @@ -41,9 +38,9 @@ rule link_files: ) output: - directory(f"{OUTPUT_BASE_PATH}/linked_files/{{individual}}/{{sample_type}}/") + directory(f"{OUTPUT_BASE_PATH}/cellranger/linked_files/{{Species_ID}}/{{individual}}") log: - "logs/link_files_{individual}_{sample_type}.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/link_files_{{individual}}.log" run: log_fh = open(log[0], "w") os.makedirs(output[0], exist_ok=True) @@ -57,9 +54,9 @@ rule link_files: rule write_samples_spreadsheet: output: - f"{OUTPUT_BASE_PATH}/{config['paths']['target_files']['samples_sheet']}" + f"{OUTPUT_BASE_PATH}/cellranger/{config['paths']['target_files']['samples_sheet']}" log: - "logs/write_samples_spreadsheet.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/write_samples_spreadsheet.log" run: with open(log[0], "w") as log_fh: print("storing samples metadata to {output[0]}", file=log_fh) @@ -75,23 +72,47 @@ rule cellranger_count: input: rules.link_files.output output: - outs_dir=f"{OUTPUT_BASE_PATH}/cellranger_count/{{individual}}_{{sample_type}}/outs" + outs_dir=directory(f"{OUTPUT_BASE_PATH}/cellranger/cellranger_count/{{Species_ID}}/{{individual}}/outs") params: genome=REFERENCES["all_masked"], output_root_dir=lambda wildcards, output: Path(output[0]).parents[1], # remove last 2 levels from output path extra=config["rules"]["cellranger_count"]["extra"] log: - "logs/cellranger_count_{individual}_{sample_type}.log" + f"{OUTPUT_BASE_PATH}/cellranger/logs/{{Species_ID}}/cellranger_count_{{individual}}.log" threads: 16 envmodules: "cellranger/6.1.1", shell: "cd {params.output_root_dir}; " + "pwd; " "cellranger count " - "--id {wildcards.individual}_{wildcards.sample_type} " # letters, numbers, hyphens, and underscores + "--id {wildcards.individual} " # letters, numbers, hyphens, and underscores "--transcriptome {params.genome} " "--fastqs {input} " "--localcores={threads} " "{params.extra} " - "--sample {wildcards.individual}_{wildcards.sample_type} " + "--sample {wildcards.individual} " "> {log}" + + +rule construct_sce_objects: + """construct SCE objects from counts matrices + + Initially, building the environment might take some time. + + Note: output must match target_templates in configuration YAML. + """ + input: + output_cellranger = rules.cellranger_count.output + output: + sce_objects = f"{OUTPUT_BASE_PATH}/sce_objects/01_cellranger_output/{{Species_ID}}/sce_{{individual}}-01" + params: + identifiers = IDENTIFIERS, + metadata = METADATA_PATH, + single_cell_object_metadata_fields = config["metadata"]["single_cell_object_metadata_fields"] + threads: 2 + conda: + "envs/construct_sce_objects.yaml" + script: + "scripts/construct_sce_objects.R" + diff --git a/workflow/envs/construct_sce_objects.yaml b/workflow/envs/construct_sce_objects.yaml new file mode 100644 index 0000000..bd49ff1 --- /dev/null +++ b/workflow/envs/construct_sce_objects.yaml @@ -0,0 +1,243 @@ +# this environment is automatically created by snakemake +name: construct-sce-objects +channels: +- conda-forge +- bioconda +- defaults +dependencies: +- r-dbplyr=2.3.0=r41hc72bb7e_0 +- bioconductor-delayedmatrixstats=1.16.0=r41hdfd78af_0 +- font-ttf-ubuntu=0.83=hab24e00_0 +- libsanitizer=12.2.0=h46fd767_19 +- bwidget=1.9.14=ha770c72_1 +- r-bit=4.0.5=r41h06615bd_0 +- libcurl=7.86.0=h7bff187_1 +- libpng=1.6.39=h753d276_0 +- r-digest=0.6.31=r41h38f115c_0 +- r-fastmap=1.1.0=r41h7525677_1 +- r-cpp11=0.4.3=r41hc72bb7e_0 +- _r-mutex=1.0.1=anacondar_1 +- r-stringi=1.7.12=r41h1ae9187_0 +- r-forcats=0.5.2=r41hc72bb7e_1 +- r-withr=2.5.0=r41hc72bb7e_1 +- r-crayon=1.5.2=r41hc72bb7e_1 +- r-highr=0.10=r41hc72bb7e_0 +- r-sys=3.4.1=r41h06615bd_0 +- r-formatr=1.14=r41hc72bb7e_0 +- r-base=4.1.3=h2f963a2_5 +- bioconductor-delayedarray=0.20.0=r41hc0cfd56_2 +- libstdcxx-ng=12.2.0=h46fd767_19 +- tktable=2.10=hb7b940f_3 +- r-sass=0.4.5=r41h38f115c_0 +- lerc=4.0.0=h27087fc_0 +- libgcc-devel_linux-64=12.2.0=h3b97bd3_19 +- bioconductor-scuttle=1.4.0=r41hc247a5b_2 +- libgfortran5=12.2.0=h337968e_19 +- r-colorspace=2.1_0=r41h133d619_0 +- r-ids=1.0.1=r41hc72bb7e_2 +- libstdcxx-devel_linux-64=12.2.0=h3b97bd3_19 +- jpeg=9e=h166bdaf_2 +- r-callr=3.7.3=r41hc72bb7e_0 +- r-curl=4.3.3=r41h06615bd_1 +- fribidi=1.0.10=h36c2ea0_0 +- krb5=1.19.3=h3790be6_0 +- bioconductor-xvector=0.34.0=r41hc0cfd56_2 +- bioconductor-biocparallel=1.28.3=r41hc247a5b_1 +- libgcc-ng=12.2.0=h65d4601_19 +- r-r.utils=2.12.2=r41hc72bb7e_0 +- r-lubridate=1.9.1=r41h133d619_0 +- r-base64enc=0.1_3=r41h06615bd_1005 +- r-readr=2.1.3=r41h7525677_1 +- ca-certificates=2022.12.7=ha878542_0 +- xorg-kbproto=1.0.7=h7f98852_1002 +- r-xml2=1.3.3=r41h044e5c7_2 +- r-scales=1.2.1=r41hc72bb7e_1 +- sed=4.8=he412f7d_0 +- harfbuzz=6.0.0=h8e241bc_0 +- readline=8.1.2=h0f457ee_0 +- zstd=1.5.2=h3eb15da_6 +- r-rcurl=1.98_1.9=r41h06615bd_1 +- libuuid=2.32.1=h7f98852_1000 +- zlib=1.2.13=h166bdaf_4 +- r-r6=2.5.1=r41hc72bb7e_1 +- bioconductor-edger=3.36.0=r41hc247a5b_2 +- libzlib=1.2.13=h166bdaf_4 +- r-rcolorbrewer=1.1_3=r41h785f33e_1 +- libedit=3.1.20191231=he28a2e2_2 +- r-farver=2.1.1=r41h7525677_1 +- r-lattice=0.20_45=r41h06615bd_1 +- libgfortran-ng=12.2.0=h69a702a_19 +- fonts-conda-ecosystem=1=0 +- r-glue=1.6.2=r41h06615bd_1 +- xorg-libice=1.0.10=h7f98852_0 +- r-utf8=1.2.2=r41h06615bd_1 +- xorg-libxt=1.2.1=h7f98852_2 +- pandoc=2.19.2=h32600fe_1 +- bioconductor-genomicranges=1.46.1=r41hc0cfd56_1 +- r-pkgconfig=2.0.3=r41hc72bb7e_2 +- r-rstudioapi=0.14=r41hc72bb7e_1 +- r-memoise=2.0.1=r41hc72bb7e_1 +- r-futile.options=1.0.1=r41hc72bb7e_1003 +- r-snow=0.4_4=r41hc72bb7e_1 +- r-httr=1.4.4=r41hc72bb7e_1 +- libffi=3.4.2=h7f98852_5 +- _openmp_mutex=4.5=2_gnu +- sysroot_linux-64=2.12=he073ed8_15 +- keyutils=1.6.1=h166bdaf_0 +- bioconductor-biobase=2.54.0=r41hc0cfd56_2 +- r-googledrive=2.0.0=r41hc72bb7e_1 +- r-rvest=1.0.3=r41hc72bb7e_1 +- _libgcc_mutex=0.1=conda_forge +- bioconductor-singlecellexperiment=1.16.0=r41hdfd78af_0 +- r-yaml=2.3.7=r41h133d619_0 +- r-selectr=0.4_2=r41hc72bb7e_2 +- r-generics=0.1.3=r41hc72bb7e_1 +- r-backports=1.4.1=r41h06615bd_1 +- r-ps=1.7.2=r41h06615bd_0 +- font-ttf-inconsolata=3.000=h77eed37_0 +- fonts-conda-forge=1=0 +- bioconductor-genomeinfodbdata=1.2.7=r41hdfd78af_2 +- r-gargle=1.2.1=r41hc72bb7e_1 +- r-viridislite=0.4.1=r41hc72bb7e_1 +- r-nlme=3.1_161=r41hac0b197_0 +- r-hms=1.1.2=r41hc72bb7e_1 +- r-munsell=0.5.0=r41hc72bb7e_1005 +- ld_impl_linux-64=2.39=hcc3a1bd_1 +- curl=7.86.0=h7bff187_1 +- c-ares=1.18.1=h7f98852_0 +- r-sitmo=2.0.2=r41h7525677_1 +- r-locfit=1.5_9.7=r41h133d619_0 +- bioconductor-matrixgenerics=1.6.0=r41hdfd78af_0 +- r-tzdb=0.3.0=r41h7525677_1 +- pango=1.50.12=hd33c08f_1 +- r-jsonlite=1.8.4=r41h133d619_0 +- libev=4.33=h516909a_1 +- libwebp-base=1.2.4=h166bdaf_0 +- bioconductor-dropletutils=1.14.2=r41hc247a5b_1 +- r-dplyr=1.0.10=r41h7525677_1 +- bioconductor-beachmat=2.10.0=r41hc247a5b_2 +- r-vroom=1.6.1=r41h38f115c_0 +- r-data.table=1.14.6=r41h06615bd_0 +- pixman=0.40.0=h36c2ea0_0 +- r-tibble=3.1.8=r41h06615bd_1 +- r-xfun=0.36=r41h38f115c_0 +- bioconductor-sparsematrixstats=1.6.0=r41hc247a5b_2 +- r-evaluate=0.20=r41hc72bb7e_0 +- xorg-xproto=7.0.31=h7f98852_1007 +- r-jquerylib=0.1.4=r41hc72bb7e_1 +- xorg-libxrender=0.9.10=h7f98852_1003 +- r-mass=7.3_58.2=r41h133d619_0 +- r-r.oo=1.25.0=r41hc72bb7e_1 +- r-rematch2=2.1.2=r41hc72bb7e_2 +- r-mime=0.12=r41h06615bd_1 +- liblapack=3.9.0=16_linux64_openblas +- r-processx=3.8.0=r41h06615bd_0 +- r-blob=1.2.3=r41hc72bb7e_1 +- libblas=3.9.0=16_linux64_openblas +- bioconductor-zlibbioc=1.40.0=r41hc0cfd56_2 +- r-haven=2.5.1=r41h7525677_0 +- r-timechange=0.2.0=r41h38f115c_0 +- r-matrixstats=0.63.0=r41h06615bd_0 +- r-assertthat=0.2.1=r41hc72bb7e_3 +- xorg-xextproto=7.3.0=h7f98852_1002 +- bioconductor-s4vectors=0.32.4=r41hc0cfd56_0 +- r-rappdirs=0.3.3=r41h06615bd_1 +- r-ggplot2=3.4.0=r41hc72bb7e_1 +- xz=5.2.6=h166bdaf_0 +- libgomp=12.2.0=h65d4601_19 +- r-cachem=1.0.6=r41h06615bd_1 +- r-tidyverse=1.3.2=r41hc72bb7e_1 +- r-clipr=0.8.0=r41hc72bb7e_1 +- r-rcpp=1.0.10=r41h38f115c_0 +- font-ttf-source-code-pro=2.038=h77eed37_0 +- font-ttf-dejavu-sans-mono=2.37=hab24e00_0 +- bioconductor-iranges=2.28.0=r41hc0cfd56_2 +- libnghttp2=1.51.0=hdcd2b5c_0 +- bioconductor-rhdf5lib=1.16.0=r41hc0cfd56_2 +- r-fansi=1.0.4=r41h133d619_0 +- kernel-headers_linux-64=2.6.32=he073ed8_15 +- gcc_impl_linux-64=12.2.0=hcc96c02_19 +- r-pillar=1.8.1=r41hc72bb7e_1 +- gettext=0.21.1=h27087fc_0 +- libiconv=1.17=h166bdaf_0 +- r-bitops=1.0_7=r41h06615bd_1 +- bioconductor-hdf5array=1.22.1=r41hc0cfd56_1 +- r-purrr=1.0.1=r41h133d619_0 +- r-broom=1.0.3=r41hc72bb7e_0 +- r-bh=1.81.0_1=r41hc72bb7e_0 +- libxml2=2.10.3=h7463322_0 +- r-isoband=0.2.7=r41h38f115c_1 +- r-futile.logger=1.4.3=r41hc72bb7e_1004 +- r-reprex=2.0.2=r41hc72bb7e_1 +- r-openssl=2.0.5=r41hb1dc35e_0 +- r-lifecycle=1.0.3=r41hc72bb7e_1 +- r-stringr=1.5.0=r41h785f33e_0 +- r-tidyselect=1.2.0=r41hc72bb7e_0 +- r-lambda.r=1.2.4=r41hc72bb7e_2 +- expat=2.5.0=h27087fc_0 +- r-r.methodss3=1.8.2=r41hc72bb7e_1 +- r-dbi=1.1.3=r41hc72bb7e_1 +- r-rmarkdown=2.20=r41hc72bb7e_0 +- xorg-libx11=1.7.2=h7f98852_0 +- bzip2=1.0.8=h7f98852_4 +- r-cli=3.6.0=r41h38f115c_0 +- r-gtable=0.3.1=r41hc72bb7e_1 +- libtiff=4.5.0=h6adf6a1_2 +- libopenblas=0.3.21=pthreads_h78a6416_3 +- tk=8.6.12=h27826a3_0 +- bioconductor-genomeinfodb=1.30.1=r41hdfd78af_0 +- bioconductor-limma=3.50.3=r41hc0cfd56_0 +- xorg-libxau=1.0.9=h7f98852_0 +- r-cellranger=1.1.0=r41hc72bb7e_1005 +- make=4.3=hd18ef5c_1 +- r-rematch=1.0.1=r41hc72bb7e_1005 +- xorg-renderproto=0.11.1=h7f98852_1002 +- graphite2=1.3.13=h58526e2_1001 +- r-htmltools=0.5.4=r41h38f115c_0 +- r-mgcv=1.8_41=r41h5f7b363_0 +- r-knitr=1.42=r41hc72bb7e_1 +- libdeflate=1.17=h0b41bf4_0 +- bioconductor-rhdf5filters=1.6.0=r41hc247a5b_2 +- fontconfig=2.14.1=hc2a2eb6_0 +- r-bslib=0.4.2=r41hc72bb7e_0 +- gfortran_impl_linux-64=12.2.0=h55be85b_19 +- r-dqrng=0.3.0=r41h7525677_1 +- libxcb=1.13=h7f98852_1004 +- xorg-libsm=1.2.3=hd9c2040_1000 +- r-googlesheets4=1.0.1=r41h785f33e_1 +- r-prettyunits=1.1.1=r41hc72bb7e_2 +- libcblas=3.9.0=16_linux64_openblas +- bioconductor-rhdf5=2.38.1=r41hbe1951d_0 +- binutils_impl_linux-64=2.39=he00db2b_1 +- icu=70.1=h27087fc_0 +- pcre2=10.40=hc3806b6_0 +- r-rlang=1.0.6=r41h7525677_1 +- r-readxl=1.4.1=r41h3ebcfa7_1 +- pthread-stubs=0.4=h36c2ea0_1001 +- r-ellipsis=0.3.2=r41h06615bd_1 +- r-magrittr=2.0.3=r41h06615bd_1 +- openssl=1.1.1s=h0b41bf4_1 +- gxx_impl_linux-64=12.2.0=hcc96c02_19 +- r-progress=1.2.2=r41hc72bb7e_3 +- r-fs=1.6.0=r41h38f115c_0 +- xorg-libxext=1.3.4=h7f98852_1 +- gsl=2.7=he838d99_0 +- r-tidyr=1.3.0=r41h38f115c_0 +- libssh2=1.10.0=haa6b8db_3 +- r-bit64=4.0.5=r41h06615bd_1 +- ncurses=6.3=h27087fc_1 +- r-matrix=1.5_3=r41h5f7b363_0 +- r-uuid=1.1_0=r41h06615bd_1 +- cairo=1.16.0=ha61ee94_1014 +- libglib=2.74.1=h606061b_1 +- r-vctrs=0.5.2=r41h38f115c_0 +- freetype=2.12.1=hca18f0e_1 +- r-tinytex=0.43=r41hc72bb7e_0 +- xorg-libxdmcp=1.1.3=h7f98852_0 +- bioconductor-biocgenerics=0.40.0=r41hdfd78af_0 +- r-dtplyr=1.2.2=r41hc72bb7e_2 +- r-modelr=0.1.10=r41hc72bb7e_0 +- r-askpass=1.1=r41h06615bd_3 +- bioconductor-summarizedexperiment=1.24.0=r41hdfd78af_0 +- r-labeling=0.4.2=r41hc72bb7e_2 + \ No newline at end of file diff --git a/workflow/scripts/construct_sce_objects.R b/workflow/scripts/construct_sce_objects.R new file mode 100644 index 0000000..3e21d8f --- /dev/null +++ b/workflow/scripts/construct_sce_objects.R @@ -0,0 +1,44 @@ +#------------------------------------------------------------------------------- + +library(tidyverse, quietly = TRUE) +library(DropletUtils, quietly = TRUE) +set.seed(1) + +individual_curr <- snakemake@wildcards[["individual"]] # currently loaded individual sample +IDENTIFIERS <- snakemake@params[["identifiers"]] + +# construct SCE object from raw cellranger output +print(paste0(snakemake@input[["output_cellranger"]], "/raw_feature_bc_matrix")) +sce <- read10xCounts(samples = paste0(snakemake@input[["output_cellranger"]], + "/raw_feature_bc_matrix"), + col.names = TRUE, + type = "sparse" ) + +# add specified metadata to object +metadata <- read.csv(file = snakemake@params[["metadata"]], + head = TRUE, + sep = ",", + check.names=FALSE, + stringsAsFactors=FALSE, + as.is=TRUE, + colClasses = "character") + +# if necessary, concatenate identifiers again to obtain all possible wildcards +if(! "individual" %in% colnames(metadata)){ + metadata <- metadata %>% + unite(col = "individual", all_of(IDENTIFIERS), sep = "_", remove = FALSE) + print('establishing "individual" column, construct_sce_objects.R') +} + +# subset data as specified by wildcard and single_cell_object_metadata_fields +metadata <- metadata |> filter(individual == individual_curr) +cols_add <- snakemake@params[["single_cell_object_metadata_fields"]] +metadata <- metadata[,colnames(metadata) %in% cols_add] + +# add cell-level metadata to each barcode in the SCE object +for(i in colnames(metadata)){ + colData(sce)[i] <- rep(metadata[1,i], ncol(sce)) +} + +print(sce) +saveRDS(sce, file = snakemake@output[["sce_objects"]]) \ No newline at end of file diff --git a/workflow/scripts/samples.py b/workflow/scripts/samples.py index bf9c40a..a41a331 100644 --- a/workflow/scripts/samples.py +++ b/workflow/scripts/samples.py @@ -2,83 +2,65 @@ from snakemake.io import Wildcards - class Samples: """ Convert the OTP-exported metadata spreadsheet into a pandas.DataFrame that provides metadata for the workflow. - - """ - # columns to select - columns = ["PID", - "Sample Type", - "Species Scientific Name", - "STRAIN", - "FastQ Path", - "FastQC Path", - "Run ID", - "ILSE_NO", - "TISSUE", - "BIRTH", - "DATE_OF_BIRTH", - "DATE_OF_DEATH", - "LANE_NO", - "READ", - "CELLRANGER_FASTQ_PATH"] - - # map to rename columns in the format of (old):(new) - columns_map = { - "PID": "individual", - "Sample Type": "sample_type", - } - + def __init__(self, config): - metadata_files = config["metadata"]["raw"] + IDENTIFIERS = config["metadata"]["identifiers"] + metadata_files = config["metadata"]["table"] self.output_base_dir = config["paths"]["output_dir"] self.target_templates = config["paths"]["target_templates"] + + # columns to select for subsampling + columns = ["PID", + "Sample Type", + "Species Scientific Name", + "STRAIN", + "FastQ Path", + "FastQC Path", + "Run ID", + "ILSE_NO", + "TISSUE", + "BIRTH", + "DATE_OF_BIRTH", + "DATE_OF_DEATH", + "LANE_NO", + "READ", + "CELLRANGER_FASTQ_PATH", + "individual"] # include 'individual' col to be generated below metadata_full = pd.concat((pd.read_csv(f) for f in metadata_files), ignore_index=True) - metadata_full["DATE_OF_BIRTH"] = metadata_full.apply(self.rename_date_of_birth, axis=1) - + # if required, generate 'individual' column containing all concatenated identifiers + if not "individual" in metadata_full.columns: + metadata_full['individual'] = metadata_full[IDENTIFIERS].apply('_'.join, axis=1) + metadata_full = self.get_cellranger_filename(metadata_full) - self.metadata = self.select_columns(metadata_full) - self.metadata = self.metadata.rename(self.columns_map, axis="columns") - - def rename_date_of_birth(self, - row: pd.Series): - """Unify the 'BIRTH' and 'DATE_OF_BIRTH' columns into - a single 'DATE_OF_BIRTH' column. - """ - values = [row["BIRTH"], row["DATE_OF_BIRTH"]] - dates = [val for val in values if not pd.isna(val)] - if len(dates) == 1: - return dates[0] - elif len(dates) != 1: - return pd.NA + self.metadata = self.select_columns(metadata_full, custom_columns = columns + IDENTIFIERS) def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: """Add column containing CellRanger compatible filename, i.e. in the format of - [Sample Name]_S[Sample_Number]_L00[Lane Number]_[Read Type]_001.fastq.gz - - Here, [Sample Name] consists of PID and Sample Type. - + + Here, [Sample Name] consists of "individual". + See also: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input """ - grouped = df.groupby(["PID", "Sample Type", "LANE_NO", "READ"]) + grouped = df.groupby(["individual", "LANE_NO", "READ"]) groups = [] for name, group in grouped: group = group.sort_values("FastQ Path") # import to have consistent sorting group["multi_sample_idx"] = range(1, len(group)+1) group["CELLRANGER_FASTQ_PATH"] = group.agg( - "{0[PID]}_{0[Sample Type]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, + "{0[individual]}_S{0[multi_sample_idx]}_L00{0[LANE_NO]}_R{0[READ]}_001.fastq.gz".format, axis=1, ) groups.append(group) @@ -87,13 +69,19 @@ def get_cellranger_filename(self, df: pd.DataFrame) -> pd.DataFrame: def select_columns(self, df: pd.DataFrame, - columns: list = None): + custom_columns: list = None): """Select/Subset columns from DataFrame to reduce - DataFrame dimensions. """ - if not columns: - columns = self.columns - return df[columns] - + DataFrame dimensions. A list of column names can be provided by `custom_columns` """ + + if custom_columns: + print(custom_columns) + df_subset = df[custom_columns] + else: + print(custom_columns) + df_subset = df[columns] + + return df_subset + @staticmethod def filter_by_wildcards( wildcards: Wildcards, @@ -104,11 +92,10 @@ def filter_by_wildcards( ) -> list: """Get item from DataFrame by subsetting using the following index attributes provided via Snakemakes wildcards object. - :return param: """ - _identifiers = ["individual", "sample_type"] + _identifiers = ["individual", "Species_ID"] if wildcards: filters = dict((k, getattr(wildcards, k)) for k in _identifiers) @@ -138,7 +125,7 @@ def targets(self) -> list: targets = list(map(lambda x: self.metadata.agg(x.format, axis=1).drop_duplicates().to_list(), target_templates.values())) return targets - - - + + + #grouped = metadata_subset.groupby(["PID", "Sample Type"])["FastQ Path"].apply(list)