Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
339 changes: 339 additions & 0 deletions datasets/Norman_2019_curation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,339 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d12cb9dc",
"metadata": {},
"source": [
"Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE133344"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ca04f335-6926-4764-82ec-374d7c6f94b4",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import os\n",
"import re\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from anndata import AnnData\n",
"from scipy.io import mmread\n",
"from scipy.sparse import coo_matrix\n",
"\n",
"from utils import download_binary_file\n",
"\n",
"# Gene program lists obtained by cross-referencing the heatmap here\n",
"# https://github.com/thomasmaxwellnorman/Perturbseq_GI/blob/master/GI_optimal_umap.ipynb\n",
"# with Figure 2b in Norman 2019\n",
"G1_CYCLE = [\n",
" \"CDKN1C+CDKN1B\",\n",
" \"CDKN1B+ctrl\",\n",
" \"CDKN1B+CDKN1A\",\n",
" \"CDKN1C+ctrl\",\n",
" \"ctrl+CDKN1A\",\n",
" \"CDKN1C+CDKN1A\",\n",
" \"CDKN1A+ctrl\",\n",
"]\n",
"\n",
"ERYTHROID = [\n",
" \"BPGM+SAMD1\",\n",
" \"ATL1+ctrl\",\n",
" \"UBASH3B+ZBTB25\",\n",
" \"PTPN12+PTPN9\",\n",
" \"PTPN12+UBASH3A\",\n",
" \"CBL+CNN1\",\n",
" \"UBASH3B+CNN1\",\n",
" \"CBL+UBASH3B\",\n",
" \"UBASH3B+PTPN9\",\n",
" \"PTPN1+ctrl\",\n",
" \"CBL+PTPN9\",\n",
" \"CNN1+UBASH3A\",\n",
" \"CBL+PTPN12\",\n",
" \"PTPN12+ZBTB25\",\n",
" \"UBASH3B+PTPN12\",\n",
" \"SAMD1+PTPN12\",\n",
" \"SAMD1+UBASH3B\",\n",
" \"UBASH3B+UBASH3A\",\n",
"]\n",
"\n",
"PIONEER_FACTORS = [\n",
" \"ZBTB10+SNAI1\",\n",
" \"FOXL2+MEIS1\",\n",
" \"POU3F2+CBFA2T3\",\n",
" \"DUSP9+SNAI1\",\n",
" \"FOXA3+FOXA1\",\n",
" \"FOXA3+ctrl\",\n",
" \"LYL1+IER5L\",\n",
" \"FOXA1+FOXF1\",\n",
" \"FOXF1+HOXB9\",\n",
" \"FOXA1+HOXB9\",\n",
" \"FOXA3+HOXB9\",\n",
" \"FOXA3+FOXA1\",\n",
" \"FOXA3+FOXL2\",\n",
" \"POU3F2+FOXL2\",\n",
" \"FOXF1+FOXL2\",\n",
" \"FOXA1+FOXL2\",\n",
" \"HOXA13+ctrl\",\n",
" \"ctrl+HOXC13\",\n",
" \"HOXC13+ctrl\",\n",
" \"MIDN+ctrl\",\n",
" \"TP73+ctrl\",\n",
"]\n",
"\n",
"GRANULOCYTE_APOPTOSIS = [\n",
" \"SPI1+ctrl\",\n",
" \"ctrl+SPI1\",\n",
" \"ctrl+CEBPB\",\n",
" \"CEBPB+ctrl\",\n",
" \"JUN+CEBPA\",\n",
" \"CEBPB+CEBPA\",\n",
" \"FOSB+CEBPE\",\n",
" \"ZC3HAV1+CEBPA\",\n",
" \"KLF1+CEBPA\",\n",
" \"ctrl+CEBPA\",\n",
" \"CEBPA+ctrl\",\n",
" \"CEBPE+CEBPA\",\n",
" \"CEBPE+SPI1\",\n",
" \"CEBPE+ctrl\",\n",
" \"ctrl+CEBPE\",\n",
" \"CEBPE+RUNX1T1\",\n",
" \"CEBPE+CEBPB\",\n",
" \"FOSB+CEBPB\",\n",
" \"ETS2+CEBPE\",\n",
"]\n",
"\n",
"MEGAKARYOCYTE = [\n",
" \"ctrl+ETS2\",\n",
" \"MAPK1+ctrl\",\n",
" \"ctrl+MAPK1\",\n",
" \"ETS2+MAPK1\",\n",
" \"CEBPB+MAPK1\",\n",
" \"MAPK1+TGFBR2\",\n",
"]\n",
"\n",
"PRO_GROWTH = [\n",
" \"CEBPE+KLF1\",\n",
" \"KLF1+MAP2K6\",\n",
" \"AHR+KLF1\",\n",
" \"ctrl+KLF1\",\n",
" \"KLF1+ctrl\",\n",
" \"KLF1+BAK1\",\n",
" \"KLF1+TGFBR2\",\n",
"]\n",
"\n",
"\n",
"def download_norman_2019(output_path: str) -> None:\n",
" \"\"\"\n",
" Download Norman et al. 2019 data and metadata files from the hosting URLs.\n",
"\n",
" Args:\n",
" ----\n",
" output_path: Output path to store the downloaded and unzipped\n",
" directories.\n",
"\n",
" Returns\n",
" -------\n",
" None. File directories are downloaded to output_path.\n",
" \"\"\"\n",
"\n",
" file_urls = (\n",
" \"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl\"\n",
" \"/GSE133344_filtered_matrix.mtx.gz\",\n",
" \"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl\"\n",
" \"/GSE133344_filtered_genes.tsv.gz\",\n",
" \"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl\"\n",
" \"/GSE133344_filtered_barcodes.tsv.gz\",\n",
" \"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl\"\n",
" \"/GSE133344_filtered_cell_identities.csv.gz\",\n",
" )\n",
"\n",
" for url in file_urls:\n",
" output_filename = os.path.join(output_path, url.split(\"/\")[-1])\n",
" download_binary_file(url, output_filename)\n",
"\n",
"\n",
"def read_norman_2019(file_directory: str) -> coo_matrix:\n",
" \"\"\"\n",
" Read the expression data for Norman et al. 2019 in the given directory.\n",
"\n",
" Args:\n",
" ----\n",
" file_directory: Directory containing Norman et al. 2019 data.\n",
"\n",
" Returns\n",
" -------\n",
" A sparse matrix containing single-cell gene expression count, with rows\n",
" representing genes and columns representing cells.\n",
" \"\"\"\n",
"\n",
" with gzip.open(\n",
" os.path.join(file_directory, \"GSE133344_filtered_matrix.mtx.gz\"), \"rb\"\n",
" ) as f:\n",
" matrix = mmread(f)\n",
"\n",
" return matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "21457d17-ce85-405e-af71-b98f55cd9dfc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_matrix.mtx.gz at ./GSE133344_filtered_matrix.mtx.gz\n",
"Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_genes.tsv.gz at ./GSE133344_filtered_genes.tsv.gz\n",
"Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_barcodes.tsv.gz at ./GSE133344_filtered_barcodes.tsv.gz\n",
"Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_cell_identities.csv.gz at ./GSE133344_filtered_cell_identities.csv.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Trying to set attribute `.obs` of view, copying.\n"
]
}
],
"source": [
"download_path = \"./norman2019/\"\n",
"\n",
"download_norman_2019(download_path)\n",
"\n",
"matrix = read_norman_2019(download_path)\n",
"\n",
"# List of cell barcodes. The barcodes in this list are stored in the same order\n",
"# as cells are in the count matrix.\n",
"cell_barcodes = pd.read_csv(\n",
" os.path.join(download_path, \"GSE133344_filtered_barcodes.tsv.gz\"),\n",
" sep=\"\\t\",\n",
" header=None,\n",
" names=[\"cell_barcode\"],\n",
")\n",
"\n",
"# IDs/names of the gene features.\n",
"gene_list = pd.read_csv(\n",
" os.path.join(download_path, \"GSE133344_filtered_genes.tsv.gz\"),\n",
" sep=\"\\t\",\n",
" header=None,\n",
" names=[\"gene_id\", \"gene_name\"],\n",
")\n",
"\n",
"# Dataframe where each row corresponds to a cell, and each column corresponds\n",
"# to a gene feature.\n",
"matrix = pd.DataFrame(\n",
" matrix.transpose().todense(),\n",
" columns=gene_list[\"gene_id\"],\n",
" index=cell_barcodes[\"cell_barcode\"],\n",
" dtype=\"int32\",\n",
")\n",
"\n",
"# Dataframe mapping cell barcodes to metadata about that cell (e.g. which CRISPR\n",
"# guides were applied to that cell). Unfortunately, this list has a different\n",
"# ordering from the count matrix, so we have to be careful combining the metadata\n",
"# and count data.\n",
"cell_identities = pd.read_csv(\n",
" os.path.join(download_path, \"GSE133344_filtered_cell_identities.csv.gz\")\n",
").set_index(\"cell_barcode\")\n",
"\n",
"# This merge call reorders our metadata dataframe to match the ordering in the\n",
"# count matrix. Some cells in `cell_barcodes` do not have metadata associated with\n",
"# them, and their metadata values will be filled in as NaN.\n",
"aligned_metadata = pd.merge(\n",
" cell_barcodes,\n",
" cell_identities,\n",
" left_on=\"cell_barcode\",\n",
" right_index=True,\n",
" how=\"left\",\n",
").set_index(\"cell_barcode\")\n",
"\n",
"adata = AnnData(matrix)\n",
"adata.obs = aligned_metadata\n",
"\n",
"# Filter out any cells that don't have metadata values.\n",
"rows_without_nans = [\n",
" index for index, row in adata.obs.iterrows() if not row.isnull().any()\n",
"]\n",
"adata = adata[rows_without_nans, :]\n",
"\n",
"# Remove these as suggested by the authors. See lines referring to\n",
"# NegCtrl1_NegCtrl0 in GI_generate_populations.ipynb in the Norman 2019 paper's\n",
"# Github repo https://github.com/thomasmaxwellnorman/Perturbseq_GI/\n",
"adata = adata[adata.obs[\"guide_identity\"] != \"NegCtrl1_NegCtrl0__NegCtrl1_NegCtrl0\"]\n",
"\n",
"# We create a new metadata column with cleaner representations of CRISPR guide\n",
"# identities. The original format is <Guide1>_<Guide2>__<Guide1>_<Guide2>_<number>\n",
"adata.obs[\"guide_merged\"] = adata.obs[\"guide_identity\"]\n",
"\n",
"control_regex = re.compile(r\"NegCtrl(.*)_NegCtrl(.*)+NegCtrl(.*)_NegCtrl(.*)\")\n",
"for i in adata.obs[\"guide_merged\"].unique():\n",
" if control_regex.match(i):\n",
" # For any cells that only had control guides, we don't care about the\n",
" # specific IDs of the guides. Here we relabel them just as \"ctrl\".\n",
" adata.obs[\"guide_merged\"].replace(i, \"ctrl\", inplace=True)\n",
" else:\n",
" # Otherwise, we reformat the guide label to be <Guide1>+<Guide2>. If Guide1\n",
" # or Guide2 was a control, we replace it with \"ctrl\".\n",
" split = i.split(\"__\")[0]\n",
" split = split.split(\"_\")\n",
" for j, string in enumerate(split):\n",
" if \"NegCtrl\" in split[j]:\n",
" split[j] = \"ctrl\"\n",
" adata.obs[\"guide_merged\"].replace(i, f\"{split[0]}+{split[1]}\", inplace=True)\n",
"\n",
"guides_to_programs = {}\n",
"guides_to_programs.update(dict.fromkeys(G1_CYCLE, \"G1 cell cycle arrest\"))\n",
"guides_to_programs.update(dict.fromkeys(ERYTHROID, \"Erythroid\"))\n",
"guides_to_programs.update(dict.fromkeys(PIONEER_FACTORS, \"Pioneer factors\"))\n",
"guides_to_programs.update(\n",
" dict.fromkeys(GRANULOCYTE_APOPTOSIS, \"Granulocyte/apoptosis\")\n",
")\n",
"guides_to_programs.update(dict.fromkeys(PRO_GROWTH, \"Pro-growth\"))\n",
"guides_to_programs.update(dict.fromkeys(MEGAKARYOCYTE, \"Megakaryocyte\"))\n",
"guides_to_programs.update(dict.fromkeys([\"ctrl\"], \"Ctrl\"))\n",
"\n",
"adata.obs[\"gene_program\"] = [guides_to_programs[x] if x in guides_to_programs else \"N/A\" for x in adata.obs[\"guide_merged\"]]\n",
"adata.obs[\"good_coverage\"] = adata.obs[\"good_coverage\"].astype(bool)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "72c5c54f",
"metadata": {},
"outputs": [],
"source": [
"adata.write('Norman_2019_raw.h5ad')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
30 changes: 30 additions & 0 deletions datasets/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import requests
import os

def download_binary_file(
file_url: str, output_path: str, overwrite: bool = False
) -> None:
"""
Download binary data file from a URL.

Args:
----
file_url: URL where the file is hosted.
output_path: Output path for the downloaded file.
overwrite: Whether to overwrite existing downloaded file.

Returns
-------
None.
"""
file_exists = os.path.exists(output_path)
if (not file_exists) or (file_exists and overwrite):
request = requests.get(file_url)
with open(output_path, "wb") as f:
f.write(request.content)
print(f"Downloaded data from {file_url} at {output_path}")
else:
print(
f"File {output_path} already exists. "
"No files downloaded to overwrite the existing file."
)