tensorflow_cuda_test.py
#!/usr/bin/env python3
"""
TensorFlow CUDA/GPU Test Program
Tests CUDA availability and performs GPU computations
"""
import os

# Disable XLA JIT compilation BEFORE importing TensorFlow.
# The RTX 5080 (Compute Capability 12.0) is not yet fully supported.
# XLA's fused kernel compilation consumes excessive memory and crashes with OOM.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=-1'
# Suppress C++ level warnings (like "GPU interconnect information not available").
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Disable oneDNN custom operations to avoid floating-point round-off warnings
# and ensure consistent CPU results for comparison.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import sys
import time

import tensorflow as tf
from dotenv import load_dotenv

def load_show_env_vars():
    """Load .env (if present) and show the TensorFlow environment variables."""
    load_dotenv()
    print("TensorFlow environment variables:")
    print(f"TF_XLA_FLAGS={os.getenv('TF_XLA_FLAGS')}")
    print(f"TF_CPP_MIN_LOG_LEVEL={os.getenv('TF_CPP_MIN_LOG_LEVEL')}")
    print(f"TF_ENABLE_ONEDNN_OPTS={os.getenv('TF_ENABLE_ONEDNN_OPTS')}")

def set_memory_growth():
    """
    Enable memory growth to prevent OOM on unsupported architectures
    where JIT compilation requires extra driver memory.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("✅ GPU Memory Growth enabled (essential for JIT compilation)")
        except RuntimeError as e:
            print(f"⚠️ Failed to set memory growth: {e}")

def print_separator():
    print("=" * 70)

def test_gpu_availability():
    """Check if GPU is available"""
    print_separator()
    print("GPU AVAILABILITY TEST")
    print_separator()
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("❌ No GPUs detected!")
        print("Possible reasons:")
        print("  - NVIDIA drivers not installed")
        print("  - TensorFlow not installed with CUDA support")
        print("  - CUDA toolkit not properly configured")
        return False
    print(f"✅ GPU(s) detected: {len(gpus)} device(s)")
    return True
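
# Hedged sketch: to confirm where each op actually runs, device placement
# logging can be switched on before any tensors are created. This is a
# standard tf.debugging API, but its output is very verbose, so it is left
# as a commented example rather than wired into the tests below:
#
#   tf.debugging.set_log_device_placement(True)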

def show_gpu_info():
    """Display GPU information"""
    print_separator()
    print("GPU INFORMATION")
    print_separator()
    gpus = tf.config.list_physical_devices('GPU')
    print(f"Number of GPUs: {len(gpus)}")
    for i, gpu in enumerate(gpus):
        print(f"\nGPU {i}:")
        print(f"  Device: {gpu.name}")
        print(f"  Device Type: {gpu.device_type}")
        # Get GPU details
        gpu_details = tf.config.experimental.get_device_details(gpu)
        if gpu_details:
            print(f"  Compute Capability: {gpu_details.get('compute_capability', 'N/A')}")
            device_name = gpu_details.get('device_name', 'Unknown')
            print(f"  Name: {device_name}")
        # Memory growth setting
        try:
            memory_growth = tf.config.experimental.get_memory_growth(gpu)
            print(f"  Memory Growth: {memory_growth}")
        except Exception:
            pass
    print(f"\nTensorFlow Version: {tf.__version__}")
    print(f"Built with CUDA: {tf.test.is_built_with_cuda()}")
    # Check if a GPU is actually visible to TensorFlow
    print(f"GPU Support Available: {len(tf.config.list_physical_devices('GPU')) > 0}")

def test_gpu_computation():
    """Perform a simple computation on GPU"""
    print_separator()
    print("GPU COMPUTATION TEST")
    print_separator()
    size = 5000
    print(f"Creating two {size}x{size} random matrices...")

    # Test on GPU
    print("\nPerforming matrix multiplication on GPU...")
    try:
        with tf.device('/GPU:0'):
            # Using simple initialization to ensure the handle is valid
            a_gpu = tf.random.normal([size, size])
            b_gpu = tf.random.normal([size, size])
            # Warm-up run
            _ = tf.matmul(a_gpu, b_gpu)
            # Timed run
            start = time.time()
            c_gpu = tf.matmul(a_gpu, b_gpu)
            # Force execution
            _ = c_gpu.numpy()
            end = time.time()
        gpu_time = end - start
        print(f"✅ GPU computation completed in {gpu_time:.4f} seconds")
    except Exception as e:
        print(f"❌ GPU Computation failed: {e}")
        print("This is likely due to Blackwell (RTX 5080) compatibility issues.")
        print("Ensure you are using the nightly build: pip install tf-nightly")
        return

    # Test on CPU
    print("\nComparing with CPU computation...")
    with tf.device('/CPU:0'):
        a_cpu = tf.random.normal([size, size])
        b_cpu = tf.random.normal([size, size])
        start = time.time()
        c_cpu = tf.matmul(a_cpu, b_cpu)
        # Force execution
        _ = c_cpu.numpy()
        end = time.time()
    cpu_time = end - start
    print(f"CPU computation completed in {cpu_time:.4f} seconds")
    speedup = cpu_time / gpu_time
    print(f"\n🚀 GPU Speedup: {speedup:.2f}x faster than CPU")

    # Verify results match (using smaller matrices for verification)
    print("\nVerifying correctness with smaller matrices...")
    with tf.device('/GPU:0'):
        a_small_gpu = tf.random.normal([100, 100])
        b_small_gpu = tf.random.normal([100, 100])
        c_small_gpu = tf.matmul(a_small_gpu, b_small_gpu)
    with tf.device('/CPU:0'):
        a_small_cpu = tf.identity(a_small_gpu)
        b_small_cpu = tf.identity(b_small_gpu)
        c_small_cpu = tf.matmul(a_small_cpu, b_small_cpu)
    max_diff = tf.reduce_max(tf.abs(c_small_gpu - c_small_cpu)).numpy()
    print(f"Maximum difference between CPU and GPU results: {max_diff:.2e}")
    # Tolerance increased to 5e-2 (0.05) to account for TensorFloat-32 (TF32)
    # precision differences on modern NVIDIA GPUs (Ampere/Blackwell).
    if max_diff < 5e-2:
        print("✅ Results match (within tolerance for TF32 execution)!")
    else:
        print("⚠️ Large difference detected - possible issue")

def test_tensor_operations():
    """Test various tensor operations on GPU"""
    print_separator()
    print("TENSOR OPERATIONS TEST")
    print_separator()
    try:
        with tf.device('/GPU:0'):
            # Test 1: Basic operations
            print("Test 1: Basic tensor operations...")
            x = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0])
            y = tf.constant([2.0, 3.0, 4.0, 5.0, 6.0])
            z = x + y
            print(f"  Addition: {z.numpy()}")
            z = x * y
            print(f"  Multiplication: {z.numpy()}")

            # Test 2: Neural network operations
            print("\nTest 2: Neural network operations...")
            input_tensor = tf.random.normal([32, 10])  # Batch of 32, 10 features
            weights = tf.random.normal([10, 5])        # 10 inputs -> 5 outputs
            bias = tf.random.normal([5])
            output = tf.matmul(input_tensor, weights) + bias
            output = tf.nn.relu(output)  # ReLU activation
            print(f"  Input shape: {input_tensor.shape}")
            print(f"  Output shape: {output.shape}")
            print("  ✅ Neural network operations work!")

            # Test 3: Gradients
            print("\nTest 3: Gradient computation...")
            x = tf.Variable(tf.random.normal([5, 5]))
            with tf.GradientTape() as tape:
                y = x ** 2
                z = tf.reduce_sum(y)
            gradients = tape.gradient(z, x)
            print(f"  Variable created: {x.shape}")
            print(f"  Gradient computed: {gradients is not None}")
            print(f"  Gradient shape: {gradients.shape}")
            print("  ✅ Gradient computation works!")
    except Exception as e:
        print(f"❌ Tensor operations failed: {e}")
        print("This is likely due to Blackwell (RTX 5080) compatibility issues.")
        print("Ensure you are using the nightly build: pip install tf-nightly")
        return
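
# Hedged sketch: the same affine + ReLU step from Test 2, expressed as a
# traced graph. Not called from main(); jit_compile is left at its default
# (no XLA), consistent with the TF_XLA_FLAGS setting at the top of the file.
def tf_function_sketch():
    @tf.function
    def affine_relu(x, w, b):
        # matmul + bias + ReLU, traced once and reused as a graph
        return tf.nn.relu(tf.matmul(x, w) + b)

    x = tf.random.normal([32, 10])
    w = tf.random.normal([10, 5])
    b = tf.random.normal([5])
    print(f"tf.function output shape: {affine_relu(x, w, b).shape}")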

def test_keras_model():
    """Test a simple Keras model on GPU"""
    print_separator()
    print("KERAS MODEL TEST")
    print_separator()
    # Clear session to ensure a clean slate for the new architecture
    tf.keras.backend.clear_session()
    print("Creating a simple neural network...")
    # Define the model, then compile it.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(10,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    # Explicitly disable JIT compilation in compile().
    # RTX 5080 (Compute Capability 12.0) is not yet supported by XLA,
    # and the JIT-compiled fused kernels crash with OOM.
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
        jit_compile=False
    )
    print(f"Model created with {model.count_params()} parameters")

    # Generate dummy data
    x_train = tf.random.normal([1000, 10])
    y_train = tf.random.uniform([1000, 1], minval=0, maxval=2, dtype=tf.int32)
    y_train = tf.cast(y_train, tf.float32)

    print("Training model for 3 epochs...")
    start = time.time()
    history = model.fit(x_train, y_train, epochs=3, batch_size=32, verbose=0)
    end = time.time()
    print(f"✅ Training completed in {end - start:.4f} seconds")
    print(f"Final loss: {history.history['loss'][-1]:.4f}")
    print(f"Final accuracy: {history.history['accuracy'][-1]:.4f}")

    # Test prediction
    x_test = tf.random.normal([10, 10])
    predictions = model.predict(x_test, verbose=0)
    print(f"\nTest predictions shape: {predictions.shape}")
    print("✅ Keras model works on GPU!")

def test_mixed_precision():
    """Test mixed precision training"""
    print_separator()
    print("MIXED PRECISION TEST")
    print_separator()
    try:
        # Check current policy
        policy = tf.keras.mixed_precision.global_policy()
        print(f"Current mixed precision policy: {policy.name}")
        # Set mixed precision policy
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        new_policy = tf.keras.mixed_precision.global_policy()
        print(f"New mixed precision policy: {new_policy.name}")
        # Create a simple layer to test
        with tf.device('/GPU:0'):
            layer = tf.keras.layers.Dense(64)
            x = tf.random.normal([32, 10])
            y = layer(x)
            print(f"Input dtype: {x.dtype}")
            print(f"Output dtype: {y.dtype}")
            print(f"Weights dtype: {layer.kernel.dtype}")
        print("✅ Mixed precision works!")
        # Reset policy
        tf.keras.mixed_precision.set_global_policy('float32')
    except Exception as e:
        print(f"⚠️ Mixed precision test failed: {e}")

def test_mnist_data():
    """Load MNIST dataset to verify Keras integration"""
    print_separator()
    print("MNIST DATA TEST")
    print_separator()
    print("Verifying Keras nightly is installed with TensorFlow nightly...")
    try:
        import keras
        (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
        print(f"Training set shape: {x_train.shape}")
        print(f"Test set shape: {x_test.shape}")
        print("✅ MNIST dataset loaded successfully!")
    except Exception as e:
        print(f"❌ Failed to load MNIST dataset: {e}")

def main():
    print("\n" + "=" * 70)
    print("TensorFlow CUDA/GPU Test Program")
    print("=" * 70 + "\n")

    # Set memory growth before any other GPU operations.
    # This is critical for new architectures like RTX 5080 (Blackwell)
    # to prevent CUDA_ERROR_INVALID_HANDLE and OOM errors.
    try:
        # Reset Keras state
        tf.keras.backend.clear_session()
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            for gpu in gpus:
                try:
                    tf.config.experimental.set_memory_growth(gpu, True)
                except RuntimeError as e:
                    # Memory growth must be set before GPUs have been initialized
                    print(f"  Note: {e}")
            print("✅ GPU memory growth enabled")
    except Exception as e:
        print(f"⚠️ Could not set memory growth: {e}")

    # Superseded by TF_XLA_FLAGS='--tf_xla_auto_jit=-1' at the top of this
    # file: with XLA JIT compilation disabled there, the calls below should
    # not be necessary. The RTX 5080 (Compute Capability 12.0) is not yet
    # fully supported, and JIT compilation of fused kernels consumes
    # excessive memory and fails with OOM.
    # set_memory_growth()
    # tf.config.optimizer.set_jit(False)

    print(f"TensorFlow Version: {tf.__version__}")
    print(f"Python Version: {sys.version.split()[0]}")

    # Test 1: GPU availability
    if not test_gpu_availability():
        print("\n❌ Exiting - No GPU detected")
        return 1
    # Test 2: GPU information
    show_gpu_info()
    # Test 3: GPU computation
    test_gpu_computation()
    # Test 4: Various tensor operations
    test_tensor_operations()
    # Test 5: Keras model
    test_keras_model()
    # Test 6: Mixed precision
    test_mixed_precision()
    # Test 7: MNIST data
    test_mnist_data()

    # Summary
    print_separator()
    print("TEST SUMMARY")
    print_separator()
    print("✅ All tests passed!")
    print("Your TensorFlow CUDA installation is working correctly.")
    print_separator()
    return 0

if __name__ == "__main__":
    load_show_env_vars()
    sys.exit(main())