-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTEMP_debug_nan_problem.py
More file actions
94 lines (75 loc) · 3.72 KB
/
TEMP_debug_nan_problem.py
File metadata and controls
94 lines (75 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Temporary debug script: locate and inspect NaN rows in the final training dataset.

Loads the original unified dataset and the processed training dataset, finds the
first target_* column containing nulls, maps that row position back to the
corresponding row of the original dataset (undoing the trimming steps), and
prints a window of the original data around it to look for nulls, zeros, or
negative prices that could have produced the NaN.
"""

import polars as pl

# Load both datasets
print("Loading datasets...")
original_df = pl.read_csv("datasets/refined/1h_unified/unified_dataset.csv")
final_df = pl.read_csv("datasets/refined/1h_unified/unified_training_dataset.csv")
print(f"Original shape: {original_df.shape}")
print(f"Final shape: {final_df.shape}")

# Find the NaN row in the final dataset
print("\n🔍 Finding NaN rows in final dataset...")
target_cols = [col for col in final_df.columns if col.startswith("target_")]

# Fix: initialize before the loop — if there are no target_* columns the loop
# body never runs and the `if null_positions:` below would raise NameError.
null_positions = []
for col in target_cols[:5]:  # Check first 5 target columns
    null_positions = final_df.with_row_index().filter(pl.col(col).is_null())["index"].to_list()
    if null_positions:
        print(f"NaN found in {col} at rows: {null_positions}")
        break

if null_positions:
    nan_row = null_positions[0]
    print(f"\n📍 NaN detected at final dataset row: {nan_row}")

    # The final dataset comes from:
    # 1. Original dataset (35438 rows)
    # 2. Trimmed end by 72 rows → 35366 rows
    # 3. After all processing, trimmed beginning by 170 rows → 35196 rows
    # But we're seeing 35293 rows, so let's work backwards

    # Row in the dataset before final trim
    before_final_trim_row = nan_row + 170
    print(f"📍 Row before final trim (slice 170): {before_final_trim_row}")

    # This should be within the original dataset after end-trimming
    # Original after end trim: 35438 - 72 = 35366
    if before_final_trim_row >= 35366:
        print(f"❌ Issue: Row {before_final_trim_row} exceeds trimmed dataset size 35366")
        print(f"This means the NaN is at the very end, suggesting insufficient trimming")
        # The issue is the last row again - we're at the boundary
        actual_original_row = 35366 - 1  # Last valid row after trimming
    else:
        actual_original_row = before_final_trim_row

    print(f"📍 Actual original row to check: {actual_original_row}")

    # Check the timestamp to verify
    if nan_row < final_df.height:
        final_timestamp = final_df[nan_row, "timestamp"]
        print(f"📅 Final dataset timestamp at NaN row: {final_timestamp}")

    # Check the original data around this problematic row
    print(f"\n🔍 Checking original data around row {actual_original_row}...")

    # Look at a window around the problematic row (5 before, 10 after, clamped)
    start_row = max(0, actual_original_row - 5)
    end_row = min(original_df.height, actual_original_row + 10)

    # Check for any null values in the original data that might cause issues
    first_asset = "ADAUSDT"  # Use first asset for debugging
    cols_to_check = [
        "timestamp",
        f"{first_asset}_close",
        f"{first_asset}_high",
        f"{first_asset}_low"
    ]

    subset = original_df.slice(start_row, end_row - start_row).select(cols_to_check)
    print(f"Original data rows {start_row} to {end_row - 1}:")
    print(subset)

    # Check for null counts in this subset
    print(f"\nNull counts in this range:")
    for col in cols_to_check[1:]:  # Skip timestamp
        null_count = subset[col].null_count()
        if null_count > 0:
            print(f"  {col}: {null_count} nulls")
            null_rows = subset.with_row_index().filter(pl.col(col).is_null())["index"].to_list()
            print(f"  At relative positions: {null_rows}")

    # Check if there are any infinite values or zeros that might cause log issues
    print(f"\nChecking for problematic values (zeros, infinities)...")
    for col in cols_to_check[1:]:
        zeros = subset.filter(pl.col(col) == 0).height
        if zeros > 0:
            print(f"  {col}: {zeros} zero values")
        negatives = subset.filter(pl.col(col) < 0).height
        if negatives > 0:
            print(f"  {col}: {negatives} negative values")
else:
    print("❌ No NaN rows found in target columns!")