-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
77 lines (69 loc) · 3.43 KB
/
prepare_data.py
File metadata and controls
77 lines (69 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# prepare_data.py
import os
import json
import polars as pl
from scipy.sparse import save_npz
from data_utils import get_cancer_type_from_sample_id, generate_bool_map
def main():
print("--- Starting Data Preparation ---")
try:
with open("config.json", 'r') as f:
config = json.load(f)
print(" INFO: Configuration loaded from config.json")
except Exception as e:
print(f" ERROR: Could not load config.json: {e}")
return
TCGA_ZSCORES_PATH = config["TCGA_ZSCORES_PATH"]
EMB_JSON_PATH = config["EMB_JSON_PATH"]
PREPROCESSED_DATA_DIR = config["PREPROCESSED_DATA_DIR"]
TOP_N_FEATURES_CUTOFF = config["TOP_N_FEATURES_CUTOFF"]
BOOL_MAP_CHUNK_SIZE = config["BOOL_MAP_CHUNK_SIZE"]
os.makedirs(PREPROCESSED_DATA_DIR, exist_ok=True)
print(f" INFO: Preprocessed data will be saved to: {PREPROCESSED_DATA_DIR}")
print(f"\nStep 1: Loading sample IDs and feature count...")
try:
df_schema_scan = pl.scan_parquet(TCGA_ZSCORES_PATH)
all_tcga_samples = sorted(df_schema_scan.collect_schema().names()[6:])
n_features_actual = df_schema_scan.select(pl.len()).collect().item()
print(f" INFO: Loaded {len(all_tcga_samples)} TCGA sample IDs and found {n_features_actual} total features.")
with open(os.path.join(PREPROCESSED_DATA_DIR, "all_tcga_samples.json"), 'w') as f: json.dump(all_tcga_samples, f)
print(f" SUCCESS: Saved all_tcga_samples.json")
except Exception as e:
print(f" ERROR: Could not load sample info: {e}")
return
print(f"\nStep 2: Generating global bool_map...")
try:
df_lazy_zscores = pl.scan_parquet(TCGA_ZSCORES_PATH)
bool_map_sparse = generate_bool_map(
df_lazy_full=df_lazy_zscores, all_tcga_samples=all_tcga_samples,
n_features_total_for_ranking=n_features_actual,
top_n_features_cutoff=TOP_N_FEATURES_CUTOFF,
bool_map_chunk_size=BOOL_MAP_CHUNK_SIZE
)
save_npz(os.path.join(PREPROCESSED_DATA_DIR, "bool_map_overall_sparse_feat_x_sample.npz"), bool_map_sparse)
print(f" SUCCESS: Global bool_map saved.")
except Exception as e:
print(f" ERROR: bool_map generation failed: {e}")
return
print(f"\nStep 3: Loading and saving embryonic groupings...")
try:
with open(EMB_JSON_PATH, 'r') as f:
embryonic_data = json.load(f)
with open(os.path.join(PREPROCESSED_DATA_DIR, "emb_groupings.json"), 'w') as f:
json.dump(embryonic_data, f)
print(f" SUCCESS: Loaded and saved embryonic groupings.")
except Exception as e:
print(f" ERROR: Could not load {EMB_JSON_PATH}: {e}")
return
print(f"\nStep 4: Creating sample_to_cancer_type map...")
sample_to_cancer_type_map = {s_id: get_cancer_type_from_sample_id(s_id) for s_id in all_tcga_samples}
with open(os.path.join(PREPROCESSED_DATA_DIR, "sample_to_cancer_type_map.json"), 'w') as f:
json.dump(sample_to_cancer_type_map, f)
print(f" SUCCESS: Saved sample_to_cancer_type_map.json.")
print(f" VERIFICATION: Review the first few mappings:")
for i, s_id in enumerate(all_tcga_samples[:min(5, len(all_tcga_samples))]):
print(f" Sample: {s_id} -> Mapped Cancer Type: {sample_to_cancer_type_map.get(s_id, 'Not Mapped')}")
print("\n--- Data Preparation Complete ---")
if __name__ == "__main__":
pl.enable_string_cache() # Corrected Polars API call
main()