# Source: datasets/McFarland_2020_Fig1.ipynb (code cells 1-2), added by the patch
# "Add notebook for downloading McFarland 2020 Figure 1 data"
# (Ethan Weinberger, Fri, 25 Mar 2022, commit 0e00cefb9f48).
#
# Patch description: adds a Jupyter notebook to download the data from
# McFarland et al., 2020 used to produce Figure 1 (i.e., response to
# idasanutlin and control DMSO for different cell lines), and a `utils.py`
# file in the datasets folder containing reusable functions for
# downloading/preprocessing.

import os
import shutil
from typing import Iterable, List, Tuple

import numpy as np
import pandas as pd
from scipy.io import mmread

# Cell lines treated as TP53 wild type; every other label is reported as
# "Mutation". Taken from https://cancerdatascience.org/blog/posts/mix-seq/
_TP53_WT_CELL_LINES = frozenset(
    {
        "LNCAPCLONEFGC_PROSTATE",
        "DKMG_CENTRAL_NERVOUS_SYSTEM",
        "NCIH226_LUNG",
        "RCC10RGB_KIDNEY",
        "SNU1079_BILIARY_TRACT",
        "CCFSTTG1_CENTRAL_NERVOUS_SYSTEM",
        "COV434_OVARY",
    }
)


# Functions for downloading/reading files


def download_mcfarland_2020(output_path: str) -> None:
    """
    Download McFarland et al. 2020 data from the hosting URLs.

    Args:
    ----
        output_path: Output path to store the downloaded and unzipped
            directories. Created if it does not exist yet.

    Returns
    -------
        None. File directories are downloaded and unzipped in output_path:
        `idasanutlin.zip` is unpacked into `idasanutlin/` and `dmso.zip`
        into `dmso/`.
    """
    # Project-local helper (datasets/utils.py). Imported here so the reader
    # functions below stay usable without `utils` on the import path.
    from utils import download_binary_file

    archive_urls = {
        "idasanutlin": "https://figshare.com/ndownloader/files/18716351",
        "dmso": "https://figshare.com/ndownloader/files/18716354",
    }

    # os.makedirs("") raises, and an empty path already means "current dir".
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for dataset_name, url in archive_urls.items():
        archive_path = os.path.join(output_path, f"{dataset_name}.zip")
        download_binary_file(url, archive_path)
        # Strip only the trailing extension. str.replace(".zip", "") would
        # also rewrite any ".zip" that appears inside output_path itself.
        extract_dir, _ = os.path.splitext(archive_path)
        shutil.unpack_archive(archive_path, extract_dir)


def _read_mixseq_df(directory: str) -> pd.DataFrame:
    """
    Load one experiment directory as a dense genes x barcodes data frame.

    Args:
    ----
        directory: Directory containing `matrix.mtx`, `barcodes.tsv` and
            `genes.tsv`.

    Returns
    -------
        Data frame of the matrix values, indexed by the first column of
        `genes.tsv` with the first column of `barcodes.tsv` as column names.
        `classifications.csv` is not needed here; cell-line labels are read
        separately by `_get_cell_line_labels`.
    """
    counts = mmread(os.path.join(directory, "matrix.mtx"))
    barcodes = pd.read_table(os.path.join(directory, "barcodes.tsv"), header=None)
    gene_names = pd.read_table(os.path.join(directory, "genes.tsv"), header=None)

    return pd.DataFrame(
        counts.toarray(),
        columns=barcodes.iloc[:, 0].values,
        index=gene_names.iloc[:, 0].values,
    )


def _get_cell_line_labels(directory: str) -> np.ndarray:
    """Return the `singlet_ID` column of `classifications.csv` in directory."""
    classifications = pd.read_csv(os.path.join(directory, "classifications.csv"))
    return classifications.singlet_ID.values


def _get_tp53_mutation_status(cell_line_labels: Iterable[str]) -> np.ndarray:
    """
    Map each cell line label to its TP53 status.

    Args:
    ----
        cell_line_labels: Cell line labels, one per cell (any iterable, e.g. a
            list or a pandas Series).

    Returns
    -------
        Array with "Wild Type" for labels in the known wild-type set and
        "Mutation" for every other label, in input order.
    """
    return np.array(
        [
            "Wild Type" if label in _TP53_WT_CELL_LINES else "Mutation"
            for label in cell_line_labels
        ]
    )


def read_mcfarland_2020(file_directory: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read the expression data for McFarland et al. 2020 in the given directory.

    Args:
    ----
        file_directory: Directory containing McFarland et al. 2020 data, laid
            out as `idasanutlin/Idasanutlin_24hr_expt1` and
            `dmso/DMSO_24hr_expt1`.

    Returns
    -------
        Two data frames of raw count expression data. The first contains
        single-cell gene expression count data from cancer cell lines exposed to
        idasanutlin with cell identification barcodes as column names and gene IDs as
        indices. The second contains count data with the same format from samples
        exposed to a control solution (DMSO).
    """
    idasanutlin_dir = os.path.join(
        file_directory, "idasanutlin", "Idasanutlin_24hr_expt1"
    )
    dmso_dir = os.path.join(file_directory, "dmso", "DMSO_24hr_expt1")

    return _read_mixseq_df(idasanutlin_dir), _read_mixseq_df(dmso_dir)
# Source: datasets/McFarland_2020_Fig1.ipynb, code cell 3.
#
# The stderr recorded with this cell in the notebook shows two anndata
# FutureWarnings (X.dtype converted to np.float32 from int64 when calling
# AnnData(...) without an explicit dtype) and a UserWarning that observation
# names are not unique.
# NOTE(review): the duplicate-name warning presumably comes from the final
# concat, since both conditions can share barcodes -- confirm, and decide
# whether downstream code needs unique obs names.

import os

import anndata
import numpy as np
import scanpy as sc  # imported by the notebook's first cell; unused in the visible cells
from anndata import AnnData


def _annotate_condition(adata_obj, experiment_dir, condition_label):
    """Attach cell line, TP53 status and condition columns to adata_obj.obs."""
    adata_obj.obs["cell_line"] = _get_cell_line_labels(experiment_dir)
    adata_obj.obs["TP53_mutation_status"] = _get_tp53_mutation_status(
        adata_obj.obs["cell_line"]
    )
    adata_obj.obs["condition"] = np.repeat(condition_label, adata_obj.shape[0])
    return adata_obj


download_path = "./"

# read_mcfarland_2020 returns genes x barcodes frames; AnnData is built from
# the transposed (barcodes x genes) frames.
idasanutlin_df, dmso_df = (
    frame.transpose() for frame in read_mcfarland_2020(download_path)
)

idasanutlin_dir = os.path.join(
    download_path, "idasanutlin", "Idasanutlin_24hr_expt1"
)
dmso_dir = os.path.join(download_path, "dmso", "DMSO_24hr_expt1")

idasanutlin_adata = _annotate_condition(
    AnnData(idasanutlin_df), idasanutlin_dir, "Idasanutlin"
)
dmso_adata = _annotate_condition(AnnData(dmso_df), dmso_dir, "DMSO")

adata = anndata.concat([idasanutlin_adata, dmso_adata])
"f1cef90a-4cc9-4dbb-8801-231c9601b10c", + "metadata": {}, + "outputs": [], + "source": [ + "# Filling in the standard metadata values\n", + "\n", + "adata.obs['perturbation_name'] = adata.obs['condition']\n", + "adata.obs['perturbation_type'] = 'small molecule'\n", + "adata.obs['perturbation_value'] = '24'\n", + "adata.obs['perturbation_unit'] = 'hrs'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/datasets/utils.py b/datasets/utils.py new file mode 100644 index 0000000..f04ec53 --- /dev/null +++ b/datasets/utils.py @@ -0,0 +1,30 @@ +import requests +import os + +def download_binary_file( + file_url: str, output_path: str, overwrite: bool = False +) -> None: + """ + Download binary data file from a URL. + + Args: + ---- + file_url: URL where the file is hosted. + output_path: Output path for the downloaded file. + overwrite: Whether to overwrite existing downloaded file. + + Returns + ------- + None. + """ + file_exists = os.path.exists(output_path) + if (not file_exists) or (file_exists and overwrite): + request = requests.get(file_url) + with open(output_path, "wb") as f: + f.write(request.content) + print(f"Downloaded data from {file_url} at {output_path}") + else: + print( + f"File {output_path} already exists. " + "No files downloaded to overwrite the existing file." + ) \ No newline at end of file