-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTEMP_debug_nan_problem.py
More file actions
94 lines (75 loc) · 3.72 KB
/
TEMP_debug_nan_problem.py
File metadata and controls
94 lines (75 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Temporary debug script: locate and inspect NaN rows in the final training dataset.

Loads the original unified dataset and the processed training dataset, finds the
first target_* column containing nulls, maps that row position back to the
corresponding row of the original dataset (undoing the trimming steps), and
prints a window of the original data around it to look for nulls, zeros, or
negative prices that could have produced the NaN.
"""

import polars as pl

# Load both datasets
print("Loading datasets...")
original_df = pl.read_csv("datasets/refined/1h_unified/unified_dataset.csv")
final_df = pl.read_csv("datasets/refined/1h_unified/unified_training_dataset.csv")
print(f"Original shape: {original_df.shape}")
print(f"Final shape: {final_df.shape}")

# Find the NaN row in the final dataset
print("\n🔍 Finding NaN rows in final dataset...")
target_cols = [col for col in final_df.columns if col.startswith("target_")]

# Fix: initialize before the loop — if there are no target_* columns the loop
# body never runs and the `if null_positions:` below would raise NameError.
null_positions = []
for col in target_cols[:5]:  # Check first 5 target columns
    null_positions = final_df.with_row_index().filter(pl.col(col).is_null())["index"].to_list()
    if null_positions:
        print(f"NaN found in {col} at rows: {null_positions}")
        break

if null_positions:
    nan_row = null_positions[0]
    print(f"\n📍 NaN detected at final dataset row: {nan_row}")

    # The final dataset comes from:
    # 1. Original dataset (35438 rows)
    # 2. Trimmed end by 72 rows → 35366 rows
    # 3. After all processing, trimmed beginning by 170 rows → 35196 rows
    # But we're seeing 35293 rows, so let's work backwards

    # Row in the dataset before final trim
    before_final_trim_row = nan_row + 170
    print(f"📍 Row before final trim (slice 170): {before_final_trim_row}")

    # This should be within the original dataset after end-trimming
    # Original after end trim: 35438 - 72 = 35366
    if before_final_trim_row >= 35366:
        print(f"❌ Issue: Row {before_final_trim_row} exceeds trimmed dataset size 35366")
        print(f"This means the NaN is at the very end, suggesting insufficient trimming")
        # The issue is the last row again - we're at the boundary
        actual_original_row = 35366 - 1  # Last valid row after trimming
    else:
        actual_original_row = before_final_trim_row

    print(f"📍 Actual original row to check: {actual_original_row}")

    # Check the timestamp to verify
    if nan_row < final_df.height:
        final_timestamp = final_df[nan_row, "timestamp"]
        print(f"📅 Final dataset timestamp at NaN row: {final_timestamp}")

    # Check the original data around this problematic row
    print(f"\n🔍 Checking original data around row {actual_original_row}...")

    # Look at a window around the problematic row (5 before, 10 after, clamped)
    start_row = max(0, actual_original_row - 5)
    end_row = min(original_df.height, actual_original_row + 10)

    # Check for any null values in the original data that might cause issues
    first_asset = "ADAUSDT"  # Use first asset for debugging
    cols_to_check = [
        "timestamp",
        f"{first_asset}_close",
        f"{first_asset}_high",
        f"{first_asset}_low"
    ]

    subset = original_df.slice(start_row, end_row - start_row).select(cols_to_check)
    print(f"Original data rows {start_row} to {end_row - 1}:")
    print(subset)

    # Check for null counts in this subset
    print(f"\nNull counts in this range:")
    for col in cols_to_check[1:]:  # Skip timestamp
        null_count = subset[col].null_count()
        if null_count > 0:
            print(f"  {col}: {null_count} nulls")
            null_rows = subset.with_row_index().filter(pl.col(col).is_null())["index"].to_list()
            print(f"  At relative positions: {null_rows}")

    # Check if there are any infinite values or zeros that might cause log issues
    print(f"\nChecking for problematic values (zeros, infinities)...")
    for col in cols_to_check[1:]:
        zeros = subset.filter(pl.col(col) == 0).height
        if zeros > 0:
            print(f"  {col}: {zeros} zero values")
        negatives = subset.filter(pl.col(col) < 0).height
        if negatives > 0:
            print(f"  {col}: {negatives} negative values")
else:
    print("❌ No NaN rows found in target columns!")