-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmg_test_arrow_light.py
More file actions
27 lines (20 loc) · 942 Bytes
/
mg_test_arrow_light.py
File metadata and controls
27 lines (20 loc) · 942 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# filename: inspect_arrow.py
import polars as pl
from pathlib import Path
# Arrow IPC file that main() checks for existence and then scans lazily.
# Relative path: resolved against the current working directory.
ARROW_PATH = Path("datasets/arrow_mega/unified_1m_megaset.arrow")
# Text file that main() overwrites with one column name per line.
LOG_FILENAME = Path("column_log.txt")
def main() -> None:
    """Log the column names of the Arrow IPC file and print its shape.

    Lazily scans ``ARROW_PATH``, writes every column name (one per line,
    UTF-8 encoded) to ``LOG_FILENAME``, then prints the row and column
    counts of the dataset.

    Raises:
        FileNotFoundError: If ``ARROW_PATH`` does not exist.
    """
    if not ARROW_PATH.exists():
        raise FileNotFoundError(f"{ARROW_PATH} not found")

    # Lazy scan so we don't load the whole file into RAM.
    lf = pl.scan_ipc(ARROW_PATH, memory_map=True)

    # --- column list ---------------------------------------------------------
    # Newer polars releases warn when ``LazyFrame.schema`` is accessed and
    # recommend ``collect_schema()``; fall back to ``.schema`` on versions
    # that predate it.
    schema = lf.collect_schema() if hasattr(lf, "collect_schema") else lf.schema
    # Materialise once: the names are both written out and counted below.
    col_names = list(schema)
    print(f"📝 Writing {len(col_names):,} column names to {LOG_FILENAME} …")
    # Explicit encoding: column names may be non-ASCII and the platform
    # default (e.g. cp1252 on Windows) could otherwise fail or garble them.
    LOG_FILENAME.write_text("\n".join(col_names), encoding="utf-8")

    # --- row / column counts -------------------------------------------------
    # Counting through the lazy frame avoids materialising the columns.
    n_rows = lf.select(pl.len()).collect().item()
    n_cols = len(col_names)
    print(f"✅ Dataset shape: {n_rows:,} rows × {n_cols:,} columns")
# Script entry point: run the inspection only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()