From 0e00cefb9f486c98ef11b5bddb872de63b3c8f1d Mon Sep 17 00:00:00 2001
From: Ethan Weinberger <ethanweinberger@pop-os.localdomain>
Date: Fri, 25 Mar 2022 10:55:30 -0700
Subject: [PATCH] Add notebook for downloading McFarland 2020 Figure 1 data

This PR adds a Jupyter notebook to download the data from
McFarland et al., 2020 used to produce Figure 1 (i.e.,
response to idasanutlin and control DMSO for different cell lines).
This PR also adds a `utils.py` file to the datasets folder
containing reusable functions for downloading/preprocessing.
---
 datasets/McFarland_2020_Fig1.ipynb | 211 +++++++++++++++++++++++++++++
 datasets/utils.py                  |  30 ++++
 2 files changed, 241 insertions(+)
 create mode 100644 datasets/McFarland_2020_Fig1.ipynb
 create mode 100644 datasets/utils.py

diff --git a/datasets/McFarland_2020_Fig1.ipynb b/datasets/McFarland_2020_Fig1.ipynb
new file mode 100644
index 0000000..a1c7e49
--- /dev/null
+++ b/datasets/McFarland_2020_Fig1.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "62385964-6d60-47de-bd3c-4a40d5c9954c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils import download_binary_file\n",
+    "import os\n",
+    "import shutil\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from scipy.io import mmread\n",
+    "import anndata\n",
+    "from anndata import AnnData\n",
+    "import scanpy as sc\n",
+    "from typing import List, Tuple"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "00d0a7c1-b9ae-414d-8b33-eb424a5c9e81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Functions for downloading/reading files\n",
+    "\n",
+    "def download_mcfarland_2020(output_path: str) -> None:\n",
+    "    \"\"\"\n",
+    "    Download McFarland et al. 2020 data from the hosting URLs.\n",
+    "\n",
+    "    Args:\n",
+    "    ----\n",
+    "        output_path: Output path to store the downloaded and unzipped\n",
+    "        directories.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "        None. File directories are downloaded and unzipped in output_path.\n",
+    "    \"\"\"\n",
+    "    idasanutlin_url = \"https://figshare.com/ndownloader/files/18716351\"\n",
+    "    idasanutlin_output_filename = os.path.join(output_path, \"idasanutlin.zip\")\n",
+    "\n",
+    "    download_binary_file(idasanutlin_url, idasanutlin_output_filename)\n",
+    "    idasanutlin_output_dir = idasanutlin_output_filename.replace(\".zip\", \"\")\n",
+    "    shutil.unpack_archive(idasanutlin_output_filename, idasanutlin_output_dir)\n",
+    "\n",
+    "    dmso_url = \"https://figshare.com/ndownloader/files/18716354\"\n",
+    "    dmso_output_filename = os.path.join(output_path, \"dmso.zip\")\n",
+    "\n",
+    "    download_binary_file(dmso_url, dmso_output_filename)\n",
+    "    dmso_output_dir = dmso_output_filename.replace(\".zip\", \"\")\n",
+    "    shutil.unpack_archive(dmso_output_filename, dmso_output_dir)\n",
+    "\n",
+    "\n",
+    "def _read_mixseq_df(directory: str) -> pd.DataFrame:\n",
+    "    data = mmread(os.path.join(directory, \"matrix.mtx\"))\n",
+    "    barcodes = pd.read_table(os.path.join(directory, \"barcodes.tsv\"), header=None)\n",
+    "    classifications = pd.read_csv(os.path.join(directory, \"classifications.csv\"))\n",
+    "    classifications[\"cell_line\"] = np.array(\n",
+    "        [x.split(\"_\")[0] for x in classifications.singlet_ID.values]\n",
+    "    )\n",
+    "    gene_names = pd.read_table(os.path.join(directory, \"genes.tsv\"), header=None)\n",
+    "\n",
+    "    df = pd.DataFrame(\n",
+    "        data.toarray(),\n",
+    "        columns=barcodes.iloc[:, 0].values,\n",
+    "        index=gene_names.iloc[:, 0].values,\n",
+    "    )\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def _get_cell_line_labels(directory: str) -> np.array:\n",
+    "    classifications = pd.read_csv(os.path.join(directory, \"classifications.csv\"))\n",
+    "    return classifications.singlet_ID.values\n",
+    "\n",
+    "\n",
+    "def _get_tp53_mutation_status(cell_line_labels: List[str]) -> np.array:\n",
+    "    # Taken from https://cancerdatascience.org/blog/posts/mix-seq/\n",
+    "    TP53_WT = [\n",
+    "        \"LNCAPCLONEFGC_PROSTATE\",\n",
+    "        \"DKMG_CENTRAL_NERVOUS_SYSTEM\",\n",
+    "        \"NCIH226_LUNG\",\n",
+    "        \"RCC10RGB_KIDNEY\",\n",
+    "        \"SNU1079_BILIARY_TRACT\",\n",
+    "        \"CCFSTTG1_CENTRAL_NERVOUS_SYSTEM\",\n",
+    "        \"COV434_OVARY\",\n",
+    "    ]\n",
+    "\n",
+    "    TP53_mutation_status = [\n",
+    "        \"Wild Type\" if x in TP53_WT else \"Mutation\" for x in cell_line_labels\n",
+    "    ]\n",
+    "    return np.array(TP53_mutation_status)\n",
+    "\n",
+    "\n",
+    "def read_mcfarland_2020(file_directory: str) -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
+    "    \"\"\"\n",
+    "    Read the expression data for McFarland et al. 2020 in the given directory.\n",
+    "\n",
+    "    Args:\n",
+    "    ----\n",
+    "        file_directory: Directory containing McFarland et al. 2020 data.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "        Two data frames of raw count expression data. The first contains\n",
+    "        single-cell gene expression count data from cancer cell lines exposed to\n",
+    "        idasanutlin with cell identification barcodes as column names and gene IDs as\n",
+    "        indices. The second contains count data with the same format from samples\n",
+    "        exposed to a control solution (DMSO).\n",
+    "    \"\"\"\n",
+    "    idasanutlin_dir = os.path.join(\n",
+    "        file_directory, \"idasanutlin\", \"Idasanutlin_24hr_expt1\"\n",
+    "    )\n",
+    "    idasanutlin_df = _read_mixseq_df(idasanutlin_dir)\n",
+    "\n",
+    "    dmso_dir = os.path.join(file_directory, \"dmso\", \"DMSO_24hr_expt1\")\n",
+    "    dmso_df = _read_mixseq_df(dmso_dir)\n",
+    "\n",
+    "    return idasanutlin_df, dmso_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "701585d6-7d2f-409e-b16b-7ca55c0b614d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_31619/762642982.py:6: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
+      "  idasanutlin_adata = AnnData(idasanutlin_df)\n",
+      "/tmp/ipykernel_31619/762642982.py:18: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
+      "  dmso_adata = AnnData(dmso_df)\n",
+      "/homes/gws/ewein/miniconda3/envs/contrastive-vi-env/lib/python3.10/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
+      "  utils.warn_names_duplicates(\"obs\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_path = \"./\"\n",
+    "\n",
+    "idasanutlin_df, dmso_df = read_mcfarland_2020(download_path)\n",
+    "idasanutlin_df, dmso_df = idasanutlin_df.transpose(), dmso_df.transpose()\n",
+    "\n",
+    "idasanutlin_adata = AnnData(idasanutlin_df)\n",
+    "idasanutlin_dir = os.path.join(\n",
+    "    download_path, \"idasanutlin\", \"Idasanutlin_24hr_expt1\"\n",
+    ")\n",
+    "idasanutlin_adata.obs[\"cell_line\"] = _get_cell_line_labels(idasanutlin_dir)\n",
+    "idasanutlin_adata.obs[\"TP53_mutation_status\"] = _get_tp53_mutation_status(\n",
+    "    idasanutlin_adata.obs[\"cell_line\"]\n",
+    ")\n",
+    "idasanutlin_adata.obs[\"condition\"] = np.repeat(\n",
+    "    \"Idasanutlin\", idasanutlin_adata.shape[0]\n",
+    ")\n",
+    "\n",
+    "dmso_adata = AnnData(dmso_df)\n",
+    "dmso_dir = os.path.join(download_path, \"dmso\", \"DMSO_24hr_expt1\")\n",
+    "dmso_adata.obs[\"cell_line\"] = _get_cell_line_labels(dmso_dir)\n",
+    "dmso_adata.obs[\"TP53_mutation_status\"] = _get_tp53_mutation_status(\n",
+    "    dmso_adata.obs[\"cell_line\"]\n",
+    ")\n",
+    "dmso_adata.obs[\"condition\"] = np.repeat(\"DMSO\", dmso_adata.shape[0])\n",
+    "\n",
+    "adata = anndata.concat([idasanutlin_adata, dmso_adata])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f1cef90a-4cc9-4dbb-8801-231c9601b10c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filling in the standard metadata values\n",
+    "\n",
+    "adata.obs['perturbation_name'] = adata.obs['condition']\n",
+    "adata.obs['perturbation_type'] = 'small molecule'\n",
+    "adata.obs['perturbation_value'] = '24'\n",
+    "adata.obs['perturbation_unit'] = 'hrs'"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/datasets/utils.py b/datasets/utils.py
new file mode 100644
index 0000000..f04ec53
--- /dev/null
+++ b/datasets/utils.py
@@ -0,0 +1,30 @@
+import requests
+import os
+
+def download_binary_file(
+    file_url: str, output_path: str, overwrite: bool = False
+) -> None:
+    """
+    Download binary data file from a URL.
+
+    Args:
+    ----
+        file_url: URL where the file is hosted.
+        output_path: Output path for the downloaded file.
+        overwrite: Whether to overwrite existing downloaded file.
+
+    Returns
+    -------
+        None.
+    """
+    file_exists = os.path.exists(output_path)
+    if (not file_exists) or (file_exists and overwrite):
+        request = requests.get(file_url)
+        with open(output_path, "wb") as f:
+            f.write(request.content)
+        print(f"Downloaded data from {file_url} at {output_path}")
+    else:
+        print(
+            f"File {output_path} already exists. "
+            "No files downloaded to overwrite the existing file."
+        )
\ No newline at end of file