-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatmul_bench.py
More file actions
39 lines (34 loc) · 1.5 KB
/
matmul_bench.py
File metadata and controls
39 lines (34 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os, time, torch
def bench(device, N=4096, iters=20, dtype=torch.float16):
    """Time N x N matrix multiplies on *device*.

    Args:
        device: torch.device or device string (e.g. "cpu", "cuda:0").
        N: square matrix dimension.
        iters: number of timed matmul iterations.
        dtype: operand dtype for the benchmark tensors.

    Returns:
        Tuple of (average seconds per iteration, achieved GFLOP/s).
    """
    device = torch.device(device)  # accept plain strings as well as torch.device
    # Create operands directly on the target device so no host-to-device
    # copies pollute the timing.
    a = torch.randn(N, N, device=device, dtype=dtype)
    b = torch.randn(N, N, device=device, dtype=dtype)
    # Warmup (important for fair timing: lazy init / kernel caches).
    # .item() forces a host sync, so each warmup matmul completes here.
    for _ in range(3):
        (a @ b).sum().item()
    if device.type == "cuda":
        torch.cuda.synchronize()
    # Timed loop.
    t0 = time.perf_counter()
    for _ in range(iters):
        (a @ b).sum().item()
    if device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    # One NxN @ NxN matmul costs ~2*N^3 FLOPs (one multiply + one add per term).
    gflops = (2 * (N**3) * iters) / dt / 1e9
    return dt / iters, gflops
if __name__ == "__main__":
    # Benchmark configuration comes from environment variables.
    N = int(os.getenv("N", "4096"))
    iters = int(os.getenv("ITERS", "20"))
    use_fp16 = os.getenv("DTYPE", "fp16").lower() == "fp16"
    dtype = torch.float16 if use_fp16 else torch.float32
    print(f"PyTorch {torch.__version__}")
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)  # sanity check before attempting a GPU run
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))
    # CPU pass — pinned to fp32 regardless of DTYPE.
    cpu_avg, cpu_gflops = bench(torch.device("cpu"), N, iters, dtype=torch.float32)
    print(f"[CPU] N={N} iters={iters} avg={cpu_avg*1e3:.2f} ms ~{cpu_gflops:.1f} GFLOP/s")
    # GPU pass, only when CUDA is present.
    if has_cuda:
        gpu_avg, gpu_gflops = bench(torch.device("cuda:0"), N, iters, dtype=dtype)
        print(f"[CUDA] N={N} iters={iters} avg={gpu_avg*1e3:.2f} ms ~{gpu_gflops:.1f} GFLOP/s (dtype={dtype})")