Skip to content

RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered #23

@veyron95

Description

@veyron95

Thank you for this great open-source project; unfortunately I ran into an issue while using it.
Could you please help me figure out why this error occurs? Thanks!

Environments:
OS: ubuntu 22.04
GPU: NVIDIA GeForce RTX 4090 D, 24GB
Driver Version: 575.64.03
CUDA Toolkit: 12.6
triton: 3.3.1
torch: 2.7.1

Error:
Image

Code:

import torch
import pickle
from pi0_infer import Pi0Inference

def main():
    """Load a converted Pi0 checkpoint and time one inference pass.

    Builds bfloat16 dummy inputs (camera images, robot state, diffusion
    noise), runs ``Pi0Inference.forward`` once, and prints the wall-clock
    duration of that call in seconds.
    """
    # NOTE(review): pickle.load must only be used on trusted files —
    # unpickling an untrusted checkpoint can execute arbitrary code.
    # The original leaked the file handle (`pickle.load(open(...))`);
    # a context manager guarantees it is closed.
    with open('converted_checkpoint.pkl', 'rb') as f:
        converted_checkpoint = pickle.load(f)

    length_of_trajectory = 10
    # Dummy inputs; shapes follow the forward() call annotations below.
    normalized_observation_image_bfloat16 = torch.randn(2, 224, 224, 3, dtype=torch.bfloat16)
    observation_state_bfloat16 = torch.randn(32, dtype=torch.bfloat16)
    diffusion_input_noise_bfloat16 = torch.randn(length_of_trajectory, 32, dtype=torch.bfloat16)

    # NOTE(review): these tensors live on the CPU. If Pi0Inference
    # launches Triton/CUDA kernels it presumably expects device tensors;
    # passing host memory to a kernel is a classic cause of
    # "illegal memory access" — confirm whether .cuda() is required here.
    infer = Pi0Inference(converted_checkpoint, 2, length_of_trajectory)

    import time
    # perf_counter is the recommended monotonic clock for measuring
    # elapsed intervals (time.time() can jump with wall-clock changes).
    # NOTE(review): for GPU work, torch.cuda.synchronize() before/after
    # is needed for an accurate measurement — confirm forward() syncs.
    t1 = time.perf_counter()

    output_actions = infer.forward(
        normalized_observation_image_bfloat16,  # (number_of_images, 224, 224, 3)
        observation_state_bfloat16,             # (32,)
        diffusion_input_noise_bfloat16,         # (length_of_trajectory, 32)
    )

    t2 = time.perf_counter()

    print(t2 - t1)

if __name__ == "__main__":
    main()

For comparison, a simple standalone Triton kernel runs correctly in the same environment:

Image
import torch
import triton
import triton.language as tl

# Elementwise vector addition: output[i] = x[i] + y[i] for i < n_elements.
# Each program instance processes one contiguous BLOCK_SIZE-wide slice;
# the bounds mask guards loads/stores in the ragged final block.
@triton.jit
def add_kernel(
    x_ptr,
    y_ptr,
    output_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    program_index = tl.program_id(axis=0)
    slice_start = program_index * BLOCK_SIZE
    idx = slice_start + tl.arange(0, BLOCK_SIZE)
    in_bounds = idx < n_elements
    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)

def test_triton_add():
    """Smoke-test the Triton add kernel against eager PyTorch addition."""
    num_elements = 1024
    device = "cuda"

    a = torch.randn(num_elements, dtype=torch.float32, device=device)
    b = torch.randn(num_elements, dtype=torch.float32, device=device)
    result = torch.empty_like(a)

    # Launch grid: one program per BLOCK_SIZE-wide slice of the input.
    grid = lambda meta: (triton.cdiv(num_elements, meta["BLOCK_SIZE"]),)

    add_kernel[grid](a, b, result, num_elements, BLOCK_SIZE=256)

    reference = a + b
    # Guard clause: report the mismatch and bail out early on failure.
    if not torch.allclose(result, reference, atol=1e-6):
        print("❌ Triton kernel output mismatch!")
        print("Max diff:", (result - reference).abs().max().item())
        return False
    print("✅ Triton kernel works correctly!")
    return True

if __name__ == "__main__":
    test_triton_add()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions