-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatmul_bench.py
More file actions
39 lines (34 loc) · 1.5 KB
/
matmul_bench.py
File metadata and controls
39 lines (34 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os, time, torch
def bench(device, N=4096, iters=20, dtype=torch.float16):
    """Time N x N matrix multiplies on *device*.

    Args:
        device: torch.device or device string (e.g. "cpu", "cuda:0").
        N: square matrix dimension.
        iters: number of timed matmul iterations.
        dtype: operand dtype for the benchmark tensors.

    Returns:
        Tuple of (average seconds per iteration, achieved GFLOP/s).
    """
    device = torch.device(device)  # accept plain strings as well as torch.device
    # Create operands directly on the target device so no host-to-device
    # copies pollute the timing.
    a = torch.randn(N, N, device=device, dtype=dtype)
    b = torch.randn(N, N, device=device, dtype=dtype)
    # Warmup (important for fair timing: lazy init / kernel caches).
    # .item() forces a host sync, so each warmup matmul completes here.
    for _ in range(3):
        (a @ b).sum().item()
    if device.type == "cuda":
        torch.cuda.synchronize()
    # Timed loop.
    t0 = time.perf_counter()
    for _ in range(iters):
        (a @ b).sum().item()
    if device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    # One NxN @ NxN matmul costs ~2*N^3 FLOPs (one multiply + one add per term).
    gflops = (2 * (N**3) * iters) / dt / 1e9
    return dt / iters, gflops
if __name__ == "__main__":
    # Benchmark configuration comes from environment variables.
    N = int(os.getenv("N", "4096"))
    iters = int(os.getenv("ITERS", "20"))
    use_fp16 = os.getenv("DTYPE", "fp16").lower() == "fp16"
    dtype = torch.float16 if use_fp16 else torch.float32
    print(f"PyTorch {torch.__version__}")
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)  # sanity check before attempting a GPU run
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))
    # CPU pass — pinned to fp32 regardless of DTYPE.
    cpu_avg, cpu_gflops = bench(torch.device("cpu"), N, iters, dtype=torch.float32)
    print(f"[CPU] N={N} iters={iters} avg={cpu_avg*1e3:.2f} ms ~{cpu_gflops:.1f} GFLOP/s")
    # GPU pass, only when CUDA is present.
    if has_cuda:
        gpu_avg, gpu_gflops = bench(torch.device("cuda:0"), N, iters, dtype=dtype)
        print(f"[CUDA] N={N} iters={iters} avg={gpu_avg*1e3:.2f} ms ~{gpu_gflops:.1f} GFLOP/s (dtype={dtype})")