-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtool_nan_inf_check.py
More file actions
127 lines (104 loc) · 4.24 KB
/
tool_nan_inf_check.py
File metadata and controls
127 lines (104 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import polars as pl
import numpy as np
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root directory; it is joined with an interval name and FINAL_OUTPUT to form
# the path of each parquet file to check.
OUTPUT_BASE = "datasets/refined/unified_2_final"

# Interval sub-directory names, visited in this order.
INTERVALS = [
    '5m', '15m', '30m',
    '1h', '2h', '4h', '6h', '8h', '12h',
]

# File name looked up inside every interval directory.
FINAL_OUTPUT = "unified_training_dataset_softclipped.parquet"
def check_parquet_file(file_path: str) -> dict:
    """Check a single parquet file for nulls, NaNs, and Infs.

    Args:
        file_path: Path of the parquet file to read.

    Returns:
        On success, a dict with the keys ``file_path``, ``total_rows``,
        ``total_cols``, ``total_cells``, ``total_nulls``, ``total_nans``,
        ``total_infs``, ``null_counts`` (the frame returned by
        ``DataFrame.null_count()``), ``nan_counts`` and ``inf_counts``
        (column name -> count, listing only columns with at least one hit)
        and ``has_issues``.

        If anything raises while reading or scanning the file, a dict with
        ``file_path``, ``error`` (the exception message) and
        ``has_issues`` set to ``True`` is returned instead; the exception is
        not propagated.
    """
    try:
        df = pl.read_parquet(file_path)

        # Basic shape information
        total_rows = df.height
        total_cols = len(df.columns)
        total_cells = total_rows * total_cols

        # Nulls are counted across every column, whatever its dtype.
        null_counts = df.null_count()
        total_nulls = sum(null_counts.row(0))

        # NaN and Inf only exist in floating-point data, so integer columns
        # are skipped: they can never contribute a hit.
        # NOTE(review): some polars versions are believed to reject
        # is_nan()/is_infinite() on integer dtypes, which would have turned
        # the whole file into an "error" result -- confirm against the
        # pinned polars version.
        float_cols = [col for col, dtype in zip(df.columns, df.dtypes)
                      if dtype in (pl.Float32, pl.Float64)]

        nan_counts = {}
        inf_counts = {}
        for col in float_cols:
            # `or 0` guards against a null sum (presumably possible for an
            # empty or all-null column on some polars versions).
            nan_count = df.select(pl.col(col).is_nan().sum()).item() or 0
            if nan_count > 0:
                nan_counts[col] = nan_count

            inf_count = df.select(pl.col(col).is_infinite().sum()).item() or 0
            if inf_count > 0:
                inf_counts[col] = inf_count

        total_nans = sum(nan_counts.values())
        total_infs = sum(inf_counts.values())

        return {
            'file_path': file_path,
            'total_rows': total_rows,
            'total_cols': total_cols,
            'total_cells': total_cells,
            'total_nulls': total_nulls,
            'total_nans': total_nans,
            'total_infs': total_infs,
            'null_counts': null_counts,
            'nan_counts': nan_counts,
            'inf_counts': inf_counts,
            'has_issues': total_nulls > 0 or total_nans > 0 or total_infs > 0,
        }
    except Exception as e:
        # Deliberate catch-all boundary: any failure is reported through the
        # result dict rather than aborting the run for the remaining files.
        return {
            'file_path': file_path,
            'error': str(e),
            'has_issues': True,
        }
def main():
    """Check every interval's parquet file and print a report to stdout.

    For each entry of ``INTERVALS`` the file
    ``OUTPUT_BASE/<interval>/FINAL_OUTPUT`` is passed to
    ``check_parquet_file``. Files that do not exist are reported and skipped.
    A per-file status line is printed while checking, followed by a summary:
    a breakdown of every file with issues, or an "all clean" verdict. The
    clean verdict is only printed when at least one file was actually
    checked, and skipped intervals are listed after the summary.
    """
    # NOTE(review): the leading "π" in some messages looks like a mis-decoded
    # emoji whose original glyph cannot be recovered from this file; it is
    # left untouched. The ✅ / ❌ / ⚠️ / • glyphs were restored from
    # mis-decoded text based on context -- confirm them against the original
    # source.
    print("π Checking parquet files for NaNs, Infs, and missing values...\n")

    all_results = []
    missing_intervals = []
    issues_found = False

    for interval in INTERVALS:
        file_path = os.path.join(OUTPUT_BASE, interval, FINAL_OUTPUT)

        if not os.path.exists(file_path):
            print(f"⚠️ {interval}: File not found - {file_path}")
            missing_intervals.append(interval)
            continue

        print(f"π Checking {interval}...")
        result = check_parquet_file(file_path)
        all_results.append(result)

        if 'error' in result:
            print(f"❌ {interval}: ERROR - {result['error']}")
            issues_found = True
        elif result['has_issues']:
            print(f"❌ {interval}: ISSUES FOUND")
            print(f" • Nulls: {result['total_nulls']:,}")
            print(f" • NaNs: {result['total_nans']:,}")
            print(f" • Infs: {result['total_infs']:,}")

            # Show problematic columns
            if result['nan_counts']:
                print(f" • NaN columns: {list(result['nan_counts'].keys())}")
            if result['inf_counts']:
                print(f" • Inf columns: {list(result['inf_counts'].keys())}")

            issues_found = True
        else:
            print(
                f"✅ {interval}: Clean "
                f"({result['total_rows']:,} rows, {result['total_cols']} cols)"
            )

    # Summary
    print("\n" + "=" * 50)
    if issues_found:
        print("❌ ISSUES FOUND in one or more files!")
        print("\nDetailed breakdown:")
        for result in all_results:
            if not result.get('has_issues', False):
                continue
            # The interval name is the file's parent directory name.
            print(f"\nπ {os.path.basename(os.path.dirname(result['file_path']))}:")
            if 'error' in result:
                print(f" ERROR: {result['error']}")
            else:
                print(f" Total cells: {result['total_cells']:,}")
                print(f" Nulls: {result['total_nulls']:,}")
                print(f" NaNs: {result['total_nans']:,}")
                print(f" Infs: {result['total_infs']:,}")
    elif not all_results:
        # Nothing was read, so nothing can be declared clean.
        print("⚠️ No files were checked.")
    else:
        print("✅ ALL FILES ARE CLEAN!")
        # No result carries an error here (that would have set
        # issues_found), so every collected result counts as a success.
        print(f"Checked {len(all_results)} files successfully.")

    if missing_intervals:
        print(
            f"⚠️ Skipped {len(missing_intervals)} missing file(s): "
            f"{missing_intervals}"
        )
if __name__ == "__main__":
    # Run the report only when executed as a script, not on import.
    main()