Thank you for this great open-source project. I ran into an issue while using it.
Could you please help me figure out why this problem occurs? Thanks!
Environment:
OS: ubuntu 22.04
GPU: NVIDIA GeForce RTX 4090 D, 24GB
Driver Version: 575.64.03
CUDA Toolkit: 12.6
triton: 3.3.1
torch: 2.7.1
Error:

Code:
import torch
import pickle
from pi0_infer import Pi0Inference
def main():
    """Run one ``Pi0Inference.forward`` call on random inputs and print its duration.

    The converted checkpoint is read from ``converted_checkpoint.pkl`` in the
    current working directory. Random bfloat16 tensors are used for the
    observation images, the observation state and the diffusion input noise.
    The elapsed time of the single ``forward`` call is printed in seconds.
    """
    import time

    # NOTE: unpickling can execute arbitrary code; only load checkpoint files
    # that come from a trusted source.
    # The context manager closes the file handle as soon as loading finishes.
    with open('converted_checkpoint.pkl', 'rb') as checkpoint_file:
        converted_checkpoint = pickle.load(checkpoint_file)

    length_of_trajectory = 10
    normalized_observation_image_bfloat16 = torch.randn(
        2, 224, 224, 3, dtype=torch.bfloat16
    )
    observation_state_bfloat16 = torch.randn(32, dtype=torch.bfloat16)
    diffusion_input_noise_bfloat16 = torch.randn(
        length_of_trajectory, 32, dtype=torch.bfloat16
    )

    infer = Pi0Inference(converted_checkpoint, 2, length_of_trajectory)

    # perf_counter() is monotonic and high-resolution, so it is the right clock
    # for measuring an interval (time.time() can jump if the system clock moves).
    # NOTE(review): if forward() returns before its GPU work has finished, this
    # interval under-reports the real latency -- confirm against pi0_infer.
    t1 = time.perf_counter()
    output_actions = infer.forward(
        normalized_observation_image_bfloat16,  # (number_of_images, 224, 224, 3)
        observation_state_bfloat16,  # (32,)
        diffusion_input_noise_bfloat16,  # (length_of_trajectory, 32)
    )
    t2 = time.perf_counter()
    print(t2 - t1)


if __name__ == "__main__":
    main()
For comparison, a plain Triton kernel runs fine in the same environment:
import torch
import triton
import triton.language as tl
@triton.jit
def add_kernel(
    x_ptr,
    y_ptr,
    output_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    """Element-wise add: write ``x + y`` to ``output_ptr`` for one block of elements.

    Each program instance handles ``BLOCK_SIZE`` consecutive offsets starting at
    ``program_id(0) * BLOCK_SIZE``. Offsets at or beyond ``n_elements`` are
    masked out of both the loads and the store.
    """
    program_index = tl.program_id(axis=0)
    first_offset = program_index * BLOCK_SIZE
    element_offsets = first_offset + tl.arange(0, BLOCK_SIZE)
    # Guard the tail block, which may extend past the end of the data.
    in_bounds = element_offsets < n_elements
    lhs = tl.load(x_ptr + element_offsets, mask=in_bounds)
    rhs = tl.load(y_ptr + element_offsets, mask=in_bounds)
    total = lhs + rhs
    tl.store(output_ptr + element_offsets, total, mask=in_bounds)
def test_triton_add():
    """Check ``add_kernel`` against ``x + y`` computed by PyTorch.

    Two random float32 vectors of 1024 elements are created on the ``"cuda"``
    device, added with the Triton kernel (``BLOCK_SIZE=256``) and compared with
    ``torch.allclose`` at ``atol=1e-6``.

    Returns:
        True if the outputs match, False otherwise. A status line is printed
        in both cases; on mismatch the maximum absolute difference is printed
        as well.
    """
    N = 1024
    device = "cuda"
    lhs = torch.randn(N, dtype=torch.float32, device=device)
    rhs = torch.randn(N, dtype=torch.float32, device=device)
    result = torch.empty_like(lhs)

    def grid(meta):
        # One program instance per BLOCK_SIZE-sized chunk, rounded up.
        return (triton.cdiv(N, meta["BLOCK_SIZE"]),)

    add_kernel[grid](lhs, rhs, result, N, BLOCK_SIZE=256)

    reference = lhs + rhs
    if not torch.allclose(result, reference, atol=1e-6):
        print("❌ Triton kernel output mismatch!")
        print("Max diff:", (result - reference).abs().max().item())
        return False

    print("✅ Triton kernel works correctly!")
    return True


if __name__ == "__main__":
    test_triton_add()
Thank you for this great open-source project. I ran into an issue while using it.
Could you please help me figure out why this problem occurs? Thanks!
Environment:
OS: ubuntu 22.04
GPU: NVIDIA GeForce RTX 4090 D, 24GB
Driver Version: 575.64.03
CUDA Toolkit: 12.6
triton: 3.3.1
torch: 2.7.1
Error:

Code:
For comparison, a plain Triton kernel runs fine in the same environment: