diff --git a/torchft/quantization.py b/torchft/quantization.py
index 4c9d72e..6899156 100644
--- a/torchft/quantization.py
+++ b/torchft/quantization.py
@@ -130,7 +130,7 @@ def _fused_kernel_quantize_into_fp8(
     # Compute maximum for the current row block by block
     col_offsets = tl.arange(0, BLOCK_SIZE)
     col_maxes = tl.full((BLOCK_SIZE,), 0, dtype=tl.float32)
-    for i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
+    for _i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
         i_row_block = tl.load(
             i_ptr + i_row_idx * i_row_stride + col_offsets * i_col_stride,
             mask=col_offsets < i_cols_num,
@@ -146,7 +146,7 @@ def _fused_kernel_quantize_into_fp8(
 
     # Scale and quantize current row block by block
     col_offsets = tl.arange(0, BLOCK_SIZE)
-    for i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
+    for _i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
         i_row_block = tl.load(
             i_ptr + i_row_idx * i_row_stride + col_offsets * i_col_stride,
             mask=col_offsets < i_cols_num,
@@ -240,7 +240,7 @@ def _fused_kernel_dequantize_from_fp8(
 
     # Dequantize and store current row block by block
     col_offsets = tl.arange(0, BLOCK_SIZE)
-    for i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
+    for _i_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
         i_quant_row_block = tl.load(
             o_quant_ptr + col_offsets,
             mask=col_offsets < i_cols_num,
@@ -315,7 +315,7 @@ def _fused_kernel_reduce_fp8(
     col_offsets = tl.arange(0, BLOCK_SIZE)
     # Compute scaling factor the reduced row
     o_row_max = 0.0
-    for o_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
+    for _o_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
         o_row_block_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
         col_offsets_mask = col_offsets < i_cols_num
         # Load blocks of quantized rows, dequantize and accumulate
@@ -347,7 +347,7 @@ def _fused_kernel_reduce_fp8(
 
     col_offsets = tl.arange(0, BLOCK_SIZE)
     # Reduce the row in blocks and write them out
-    for o_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
+    for _o_b in range(0, tl.cdiv(i_cols_num, BLOCK_SIZE)):
         o_row_block_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
         col_offsets_mask = col_offsets < i_cols_num
         # Load blocks of quantized rows, dequantize and accumulate