77 changes: 57 additions & 20 deletions notebooks/0.download-data/2.preprocessing.ipynb
@@ -36,7 +36,8 @@
"import polars as pl\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils.data_utils import split_meta_and_features, add_cell_id_hash"
"from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
"from utils.io_utils import load_profiles"
]
},
{
@@ -93,15 +94,6 @@
" \"All elements in specific_plates must be pathlib.Path objects\"\n",
" )\n",
"\n",
" def load_profile(file: pathlib.Path) -> pl.DataFrame:\n",
" \"\"\"internal function to load a single profile file.\"\"\"\n",
" profile_df = pl.read_parquet(file)\n",
" meta_cols, _ = split_meta_and_features(profile_df)\n",
" if shared_features is not None:\n",
" # Only select metadata and shared features\n",
" return profile_df.select(meta_cols + shared_features)\n",
" return profile_df\n",
"\n",
" # Use specific_plates if provided, otherwise gather all .parquet files\n",
" if specific_plates is not None:\n",
" # Validate that all specific plate files exist\n",
@@ -115,7 +107,9 @@
" raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
"\n",
" # Load and concatenate profiles\n",
" loaded_profiles = [load_profile(f) for f in files_to_load]\n",
" loaded_profiles = [\n",
" load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
" ]\n",
"\n",
" # Concatenate all loaded profiles\n",
" return pl.concat(loaded_profiles, rechunk=True)\n",
@@ -205,6 +199,11 @@
"# Setting profiles directory\n",
"profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
"\n",
"# setting connectivity map drug repurposing config\n",
"drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
" strict=True\n",
")\n",
"\n",
"# Experimental metadata\n",
"exp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -286,6 +285,14 @@
"- Adding a unique cell id has column `Metadata_cell_id`"
]
},
{
"cell_type": "markdown",
"id": "9ec882fa",
"metadata": {},
"source": [
"We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -306,12 +313,38 @@
")\n",
"\n",
"# create an index columm and unique cell ID based on features of a single profiles\n",
"cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n",
"cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)"
]
},
{
"cell_type": "markdown",
"id": "3df9bbf5",
"metadata": {},
"source": [
"Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "adfb9148",
"metadata": {},
"outputs": [],
"source": [
"# load drug repurposing moa file and add prefix to metadata columns\n",
"rep_moa_df = pl.read_csv(\n",
" drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
"\n",
"# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
"cpjump1_profiles = cpjump1_profiles.join(\n",
" rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
")\n",
"\n",
"# Split meta and features\n",
"# split meta and feature\n",
"meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
"\n",
"# Saving metadata and features of the concat profile into a json file\n",
"# save the feature space information into a json file\n",
"meta_features_dict = {\n",
" \"concat-profiles\": {\n",
" \"meta-features\": meta_cols,\n",
@@ -321,7 +354,11 @@
"with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
" json.dump(meta_features_dict, f, indent=4)\n",
"\n",
"# save as parquet with defined order of columns\n",
"# save concatenated profiles\n",
"# Loading compound profiles with shared features and concat into a single DataFrame\n",
"concat_output_path = (\n",
" cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
").resolve()\n",
"cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
]
},
@@ -350,7 +387,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "c5471d3e",
"metadata": {},
"outputs": [],
@@ -404,7 +441,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "c57da947",
"metadata": {},
"outputs": [],
@@ -437,7 +474,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "1d7ced04",
"metadata": {},
"outputs": [],
@@ -490,7 +527,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "42108980",
"metadata": {},
"outputs": [],
@@ -537,7 +574,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "1763d383",
"metadata": {},
"outputs": [],
58 changes: 40 additions & 18 deletions notebooks/0.download-data/nbconverted/2.preprocessing.py
@@ -26,6 +26,7 @@

sys.path.append("../../")
from utils.data_utils import add_cell_id_hash, split_meta_and_features
from utils.io_utils import load_profiles

# ## Helper functions
#
@@ -71,15 +72,6 @@ def load_and_concat_profiles(
"All elements in specific_plates must be pathlib.Path objects"
)

def load_profile(file: pathlib.Path) -> pl.DataFrame:
"""internal function to load a single profile file."""
profile_df = pl.read_parquet(file)
meta_cols, _ = split_meta_and_features(profile_df)
if shared_features is not None:
# Only select metadata and shared features
return profile_df.select(meta_cols + shared_features)
return profile_df

# Use specific_plates if provided, otherwise gather all .parquet files
if specific_plates is not None:
# Validate that all specific plate files exist
@@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame:
raise FileNotFoundError(f"No profile files found in {profile_dir}")

# Load and concatenate profiles
loaded_profiles = [load_profile(f) for f in files_to_load]
loaded_profiles = [
load_profiles(f, shared_features=shared_features) for f in files_to_load
]

# Concatenate all loaded profiles
return pl.concat(loaded_profiles, rechunk=True)
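
# A minimal usage sketch of the refactored helper (hedged: the directory and
# feature names are illustrative, and the parameter names are inferred from
# the function body above, not from a full signature shown in this diff):
example_profiles = load_and_concat_profiles(
    profile_dir=pathlib.Path("../../data/sc-profiles/cpjump1"),
    shared_features=["Cells_AreaShape_Area", "Nuclei_AreaShape_Area"],
)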
@@ -173,6 +167,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# Setting profiles directory
profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)

# setting the Connectivity Map drug repurposing annotation file
drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve(
strict=True
)

# Experimental metadata
exp_metadata_path = (
profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv"
@@ -238,6 +237,8 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# - Data integrity is maintained during the merge operation
# - Adding a unique cell id hash column `Metadata_cell_id`

# We load the per-plate parquet profiles for the compound-treated plates, select the shared feature set, concatenate them into a single Polars DataFrame while preserving metadata, and add a unique `Metadata_cell_id` to each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis.

# In[5]:


@@ -256,10 +257,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# create an index column and a unique cell ID based on the features of each single-cell profile
cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)

# Split meta and features

# Next, we annotate the compound treatments in the CPJUMP1 dataset by labeling each cell with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.
#

# In[6]:


# load the drug repurposing MoA file and prefix its columns with Metadata_
rep_moa_df = pl.read_csv(
drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy"
).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x)

# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname
cpjump1_profiles = cpjump1_profiles.join(
rep_moa_df, on="Metadata_pert_iname", how="left"
)
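
# optional sanity check (a sketch; assumes the Repurposing Hub table exposes an
# `moa` column, which the rename above turns into `Metadata_moa`): count cells
# whose perturbation found no annotation in the left join
n_unannotated = cpjump1_profiles.filter(pl.col("Metadata_moa").is_null()).height
print(f"cells without MoA annotation: {n_unannotated}")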

# split meta and features
meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)

# Saving metadata and features of the concat profile into a json file
# save the feature space information into a json file
meta_features_dict = {
"concat-profiles": {
"meta-features": meta_cols,
@@ -269,7 +287,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f:
json.dump(meta_features_dict, f, indent=4)

# save as parquet with defined order of columns
# save concatenated profiles as parquet with a defined column order
concat_output_path = (
cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet"
).resolve()
cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)


Expand All @@ -290,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
#
# The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles.

# In[6]:
# In[7]:


# load in mitocheck profiles and save as parquet
@@ -334,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr

# Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis.
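
# A sketch of this prefix-stripping step (hedged: the signature matches the
# hunk context above; the repo's actual `remove_feature_prefixes`
# implementation may differ):

def _remove_feature_prefixes_sketch(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
    # rename only the columns that carry the prefix, leaving the rest untouched
    return df.rename(
        {col: col.removeprefix(prefix) for col in df.columns if col.startswith(prefix)}
    )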

# In[7]:
# In[8]:


# Split profiles to only retain cell profiler features
@@ -357,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr

# Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.
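
# A sketch of the split (hedged: assumes metadata columns carry the
# `Metadata_` prefix, the convention used throughout these profiles; the
# repo's `split_meta_and_features` may apply additional logic):

def _split_meta_and_features_sketch(df: pl.DataFrame) -> tuple[list[str], list[str]]:
    # partition columns into metadata and morphology features by prefix
    meta_cols = [c for c in df.columns if c.startswith("Metadata_")]
    feature_cols = [c for c in df.columns if not c.startswith("Metadata_")]
    return meta_cols, feature_cols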

# In[8]:
# In[9]:


# manually selecting metadata features that are present across all 3 profiles
@@ -406,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
)


# In[9]:
# In[10]:


# create concatenated mitocheck profiles
@@ -444,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
#
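# A sketch of the idea behind `add_cell_id_hash` (hedged: the repo's actual
# implementation may differ): hash the concatenation of a row's feature
# values into a stable per-cell identifier.

def _cell_id_hash_sketch(df: pl.DataFrame, feature_cols: list[str]) -> pl.DataFrame:
    # join all feature values as strings, then hash them into one UInt64 id
    return df.with_columns(
        pl.concat_str([pl.col(c).cast(pl.Utf8) for c in feature_cols], separator="|")
        .hash(seed=0)
        .alias("Metadata_cell_id")
    )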

# In[10]:
# In[11]:


# load in cfret profiles and add a unique cell ID
11 changes: 11 additions & 0 deletions utils/io_utils.py
@@ -9,11 +9,14 @@
import yaml
from tqdm import tqdm

from .data_utils import split_meta_and_features


def load_profiles(
fpath: str | pathlib.Path,
convert_to_f32: bool = False,
verbose: bool | None = False,
shared_features: list[str] | None = None,
) -> pl.DataFrame:
"""Load single-cell profiles from given file path.

@@ -29,6 +32,9 @@ def load_profiles(
If True, converts all Float64 columns to Float32 to save memory. Default is False.
verbose : bool, optional
If True, prints information about the loaded profiles. Default is False.
shared_features : list[str] | None, optional
If provided, only loads metadata columns and these specific feature columns.
Default is None (loads all columns).

Returns
-------
@@ -61,6 +67,11 @@ def load_profiles(
# load profiles
loaded_profiles = pl.read_parquet(fpath)

# filter to shared features if provided
if shared_features is not None:
meta_cols, _ = split_meta_and_features(loaded_profiles)
loaded_profiles = loaded_profiles.select(meta_cols + shared_features)

# convert all Float64 columns to Float32 if convert_to_f32 is True
if convert_to_f32:
loaded_profiles = loaded_profiles.with_columns(
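
# A minimal usage sketch of the extended loader (hedged: the file and feature
# names are hypothetical; `shared_features` restricts the returned columns to
# the metadata columns plus the listed features):
#
#     df = load_profiles(
#         "plate_1.parquet",
#         shared_features=["Cells_AreaShape_Area", "Nuclei_AreaShape_Area"],
#     )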