-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_utils.py
More file actions
54 lines (47 loc) · 2.4 KB
/
data_utils.py
File metadata and controls
54 lines (47 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import polars as pl
import numpy as np
from scipy.sparse import csr_array, vstack as sp_vstack
import scipy.sparse as sp
from tqdm.auto import tqdm
def get_cancer_type_from_sample_id(sample_id: str) -> str:
    """
    Extract the cancer-type prefix from a sample ID.

    Handles both formats:
    - Traditional: 'CANCER-XXX' (returns 'CANCER')
    - scATAC format: 'scATAC_CANCER_XXX' (returns 'scATAC_CANCER')

    Returns:
        The cancer-type prefix, or one of two sentinels:
        - "UNKNOWN" when ``sample_id`` is not a string;
        - "UNKNOWN_CANCER_CODE" when it is a string that matches neither
          format, or whose cancer-code field is empty (e.g. 'scATAC_',
          'scATAC__XXX' or '-XXX').
    """
    if not isinstance(sample_id, str):
        return "UNKNOWN"

    if sample_id.startswith("scATAC_"):
        # Format: scATAC_CANCER_XXXXX -> the code is the second '_' field.
        # The prefix itself guarantees at least two fields, so a length check
        # can never fail here; the malformed case to reject is an empty code.
        cancer_code = sample_id.split("_", 2)[1]
        if cancer_code:
            return f"scATAC_{cancer_code}"
    elif "-" in sample_id:
        # Traditional format: CANCER-XXXXX -> everything before the first '-'.
        cancer_code = sample_id.partition("-")[0]
        if cancer_code:
            return cancer_code

    return "UNKNOWN_CANCER_CODE"
def generate_bool_map(df_lazy_full: pl.LazyFrame, all_tcga_samples: list,
                      n_features_total_for_ranking: int, top_n_features_cutoff: int,
                      bool_map_chunk_size: int) -> sp.csr_matrix:
    """Generates the bool_map (features x samples) from a Polars LazyFrame.

    Each column named in ``all_tcga_samples`` is ranked in ascending order
    (ties broken at random) and replaced by
    ``n_features_total_for_ranking - rank``. An entry of the result is 1 where
    that transformed rank is below ``top_n_features_cutoff`` and 0 otherwise.

    Args:
        df_lazy_full: Lazy frame holding one column per sample.
        all_tcga_samples: Names of the sample columns to rank; they become the
            columns of the result, in this order.
        n_features_total_for_ranking: Value the rank is subtracted from; also
            the upper bound of the row range that is walked in chunks.
            NOTE(review): presumably equal to the number of rows in
            ``df_lazy_full`` -- confirm against the caller.
        top_n_features_cutoff: Transformed ranks strictly below this are
            flagged as 1.
        bool_map_chunk_size: Number of rows densified at a time before being
            converted to a sparse block.

    Returns:
        The row-wise stack of all per-chunk sparse blocks, in CSR format with
        ``int8`` values.
        NOTE(review): the blocks are ``csr_array`` objects, so depending on
        the SciPy version the stacked result may be a sparse *array* rather
        than the annotated ``csr_matrix`` -- confirm.

    Raises:
        ValueError: If no chunk was produced (e.g. the row range is empty).
    """
    print(f" INFO: Using N_FEATURES_TOTAL_FOR_RANKING = {n_features_total_for_ranking} for ranking.")
    df_ranked_lazy = df_lazy_full.select(
        [(n_features_total_for_ranking - pl.col(sample).rank(method="random", descending=False)).alias(sample)
         for sample in all_tcga_samples]
    )
    print(" INFO: Defined lazy df_ranked expression for bool_map generation.")

    print(f" INFO: Generating bool_map with TOP_N_FEATURES_CUTOFF = {top_n_features_cutoff}...")
    # Materialise the ranks exactly once. Collecting the lazy query per chunk
    # would re-run the unseeded random tie-breaking for every chunk, so tied
    # values could receive inconsistent ranks across chunks (and the full
    # ranking would be recomputed each time). Only the dense boolean
    # conversion below is chunked.
    df_ranked = df_ranked_lazy.collect()

    blocks = []
    for start_idx in tqdm(range(0, n_features_total_for_ranking, bool_map_chunk_size),
                          desc=" Processing feature chunks for bool_map"):
        chunk_ranks = df_ranked.slice(start_idx, bool_map_chunk_size)
        chunk_bool_numpy = (chunk_ranks.to_numpy() < top_n_features_cutoff).astype(np.int8)
        blocks.append(csr_array(chunk_bool_numpy))

    if not blocks:
        raise ValueError("No blocks generated for bool_map.")

    bool_map_sparse_features_x_samples = sp_vstack(blocks, format="csr")
    print(f" INFO: Generated bool_map_sparse_features_x_samples. Shape: {bool_map_sparse_features_x_samples.shape}, "
          f"Stored elements: {bool_map_sparse_features_x_samples.nnz}")
    return bool_map_sparse_features_x_samples