From 3942d12adb2e71fc87af6380aa1ab94135aaf1e1 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 29 Jul 2025 11:57:27 +0000
Subject: [PATCH 01/26] Add gfx950 build support + fp16 fix + index type fix

---
 fbgemm_gpu/cmake/Hip.cmake                    |  8 ++++++++
 .../embedding_backward_split_template.cu      |  2 +-
 ..._backward_split_device_kernel_template.hip |  2 +-
 .../include/fbgemm_gpu/rocm/cdna_guard.h      |  2 +-
 .../fbgemm_gpu/rocm/split_embeddings_common.h | 20 ++++++++++++++++++-
 fbgemm_gpu/src/tbe/eeg/indices_generator.cpp  |  2 +-
 6 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake
index 17640b7254..2011a34c33 100644
--- a/fbgemm_gpu/cmake/Hip.cmake
+++ b/fbgemm_gpu/cmake/Hip.cmake
@@ -78,6 +78,14 @@ if(HIP_FOUND)
   list(APPEND HIP_CXX_FLAGS -mf16c)
   list(APPEND HIP_CXX_FLAGS -mfma)
   list(APPEND HIP_CXX_FLAGS -std=c++20)
+  list(APPEND HIP_CXX_FLAGS -g)
+  list(APPEND HIP_CXX_FLAGS -ggdb)
+
+ # list(APPEND HIP_CXX_FLAGS -Wa,-adhln)
+  #list(APPEND HIP_CXX_FLAGS -adhln)
+  list(APPEND HIP_CXX_FLAGS -save-temps)
+  list(APPEND HIP_CXX_FLAGS -fverbose-asm)
+
 
   set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
   # Ask hcc to generate device code during compilation so we can use
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 186a9d529f..7bc4427355 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1195,7 +1195,7 @@ Tensor {{ embedding_cuda_op }}(
                     const auto supported_weights_type = dev_weights.scalar_type() == at::ScalarType::Half
                                                       || dev_weights.scalar_type() == at::ScalarType::Float;
 
-                    if (use_hip_kernel && supported_weights_type && !mixed_D && rocm::is_supported_cdna())
+                    if (use_hip_kernel && supported_weights_type && rocm::is_supported_cdna())
                     {
                         constexpr int segments_per_workgroup = 4;
                         {%- for kDimSize in [64, 128, 160, 192, 256] %}
diff --git a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
index 2fcbba395e..5acc61382e 100644
--- a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
+++ b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
@@ -179,7 +179,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     const int32_t segment_length_mod = segment_length & length_mask;
 
     cache_t grad_acc[dword_per_row];
-    int32_t infos[segment_unroll];
+    uint32_t infos[segment_unroll];
     grad_t grad_data[dword_per_row * segment_prefetch];
     emb_t emb_data[dword_per_row];
     float indice_weights[segment_unroll];
diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h
index b55fd72fce..447613c5fc 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/cdna_guard.h
@@ -38,7 +38,7 @@
 namespace fbgemm_gpu::rocm {
 
 [[nodiscard]] inline bool is_supported_cdna() {
-  const std::set<std::string> supported_archs{"gfx942", "gfx90a"};
+  const std::set<std::string> supported_archs{"gfx942", "gfx90a", "gfx950"};
   int device_id = 0;
   HIP_CHECK(hipGetDevice(&device_id));
   hipDeviceProp_t dev_props;
diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index b3a56c4b52..c96da01063 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -215,6 +215,24 @@ struct load_row_per_warp<half, 512, index_t> {
   }
 };
 
+template <int32_t embedding_dim, typename index_t>
+struct load_row_per_warp<c10::Half, embedding_dim, index_t> {
+  static __device__ void run(
+      c10::Half* emb_data,
+      index_t row_index,
+      const c10::Half* p_emb_table,
+      int lane_id) {
+        load_row_per_warp<half, embedding_dim, index_t>::run(
+          reinterpret_cast<half*>(emb_data),
+          row_index,
+          reinterpret_cast<const half*>(p_emb_table),
+          lane_id
+        );
+      }
+
+};
+
+
 template <
     typename emb_t,
     int32_t embedding_dim,
@@ -471,7 +489,7 @@ __device__ __forceinline__ void generic_dpp_reduction(data_t& result) {
 // of trivial operation with an option to use custom operation
 template <typename data_t, typename reduce_op_t, int wave_size = 64>
 __device__ __forceinline__ void dpp_reduction(data_t& result) {
-#if defined(__gfx942__) || defined(__gfx90a__)
+#if defined(__gfx942__) || defined(__gfx90a__) || defined(__gfx950__)
   if constexpr (std::is_same_v<reduce_op_t, reduce_op::sum>) {
     DPP_REDUCE_F16_F32(add);
     return;
diff --git a/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp b/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
index 36cd335a02..74b798d731 100644
--- a/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
+++ b/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
@@ -132,7 +132,7 @@ torch::Tensor IndicesGenerator::generate() {
   // Now sort the indices by their tags. Use parallel sort for some extra speed
   // (vector is very large).
   std::sort(
-      std::execution::par,
+  //    std::execution::par,
       std::begin(indicesWithTags),
       std::end(indicesWithTags),
       [](const std::pair<int64_t, double>& lhs,

From 7a20f4c4d40b54c0466fdb1232a9077ffd40a209 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 29 Jul 2025 13:16:41 +0000
Subject: [PATCH 02/26] Change int64_t to index_t as template parameters in
 load_raw_per_warp

---
 .../rocm/embedding_backward_split_device_kernel_template.hip    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
index 5acc61382e..d5841d6e00 100644
--- a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
+++ b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
@@ -452,7 +452,7 @@ L_tail_grad_acc:
     }
 
     // load the old emb weight data
-    load_row_per_warp<emb_t, embedding_dim, int64_t>::run(
+    load_row_per_warp<emb_t, embedding_dim, index_t>::run(
         &emb_data[0], emb_idx, p_emb_table, lane_id);
     optimizer_t optimizer(opt_karg);
     optimizer.template update<dword_per_row, segment_split>(grad_acc, emb_data, emb_idx);

From 1236b3ace1300d51fa71d520cbe2ddbcc8416ef0 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 29 Jul 2025 14:39:22 +0000
Subject: [PATCH 03/26] Implement llvm fp16 buffer load for gfx950

---
 .../fbgemm_gpu/rocm/split_embeddings_common.h      | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index c96da01063..4b33fd1422 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -60,7 +60,12 @@ __device__ half llvm_amdgcn_raw_buffer_load_fp16(
     int32x4_t srsrc,
     int32_t voffset,
     int32_t soffset,
-    int32_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
+    int32_t glc_slc)
+#if defined(__gfx950__)
+      __asm("llvm.amdgcn.raw.buffer.load.i16");
+#else
+      __asm("llvm.amdgcn.raw.buffer.load.f16");
+#endif
 
 __device__ float llvm_amdgcn_raw_buffer_load_fp32(
     int32x4_t srsrc,
@@ -72,7 +77,12 @@ __device__ half2 llvm_amdgcn_raw_buffer_load_fp16x2(
     int32x4_t srsrc,
     int32_t voffset,
     int32_t soffset,
-    int32_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
+    int32_t glc_slc)
+#if defined(__gfx950__)
+      __asm("llvm.amdgcn.raw.buffer.load.i32");
+#else
+      __asm("llvm.amdgcn.raw.buffer.load.v2f16");
+#endif
 
 __device__ void llvm_amdgcn_raw_buffer_store_fp32(
     float vdata,

From 3bc1ba864f0c2f539877bd4f25c2836c100e5591 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Mon, 11 Aug 2025 08:23:47 +0000
Subject: [PATCH 04/26] Fix c-style half to float cast

---
 .../include/fbgemm_gpu/rocm/split_embeddings_common.h    | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index 4b33fd1422..238a83440a 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -261,7 +261,14 @@ struct accumulate_row_per_warp {
     } else {
 #pragma unroll
       for (int i = 0; i < dword_per_row; i++) {
-        acc[i] += static_cast<output_t>((float)emb_data[i] * row_weight);
+        if constexpr (std::is_same_v<emb_t, c10::Half>)
+        {
+          acc[i] += static_cast<output_t>(__half2float(emb_data[i]) * row_weight);
+        }
+        else
+        {
+          acc[i] += static_cast<output_t>(static_cast<float>(emb_data[i]) * row_weight);
+        }
       }
     }
   }

From 6dcd1043129eb34e3656b1367f654dc6715eb357 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Mon, 11 Aug 2025 08:24:29 +0000
Subject: [PATCH 05/26] Patch 256 half stores

---
 .../include/fbgemm_gpu/rocm/split_embeddings_common.h  | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index 238a83440a..974eae2594 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -294,6 +294,16 @@ struct store_row_per_warp {
   }
 };
 
+template <>
+struct store_row_per_warp<c10::Half, 256, c10::Half> {
+  static __device__ void run(c10::Half* acc, c10::Half* p_output, int lane_id) {
+    auto out = reinterpret_cast<half2*>(p_output);
+    out[lane_id] = *reinterpret_cast<half2*>(acc);
+    out[lane_id + 64] = *reinterpret_cast<half2*>(&acc[2]);
+  }
+};
+
+
 template <>
 struct store_row_per_warp<half, 128, float> {
   static __device__ void run(float* acc, float* p_output, int lane_id) {

From dc3d3e08acace661940af7a75b6ae5a13a34f830 Mon Sep 17 00:00:00 2001
From: shbiswas834 <shreyashri.biswas@amd.com>
Date: Fri, 8 Aug 2025 05:02:58 +0000
Subject: [PATCH 06/26] cta_per_row workgroup optim

---
 .../training/backward/embedding_backward_split_template.cu    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 7bc4427355..6e6796f0ef 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1044,7 +1044,7 @@ Tensor {{ embedding_cuda_op }}(
 
                     // Compute shared memory size for cta_per_row
                     constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
-                    int32_t num_cta_per_row_groups = kMaxThreads / kWarpSize;
+                    int32_t num_cta_per_row_groups = (kMaxThreads/2) / kWarpSize;
                     const size_t cta_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
                         &num_cta_per_row_groups,
                         [&] (int num_groups) {
@@ -1055,7 +1055,7 @@ Tensor {{ embedding_cuda_op }}(
                     );
 
                     const int32_t cta_per_row_grid_size = std::min(
-                        div_round_up(total_unique_indices, kMaxThreads),
+                        div_round_up(total_unique_indices, (kMaxThreads/2)),
                         get_max_thread_blocks_());
 
                     FBGEMM_LAUNCH_KERNEL(

From c5a6b25935eb599078951227e6a7506bf4f24c36 Mon Sep 17 00:00:00 2001
From: Shreyashri Biswas <shreyashri.biswas@amd.com>
Date: Mon, 11 Aug 2025 21:06:48 +0000
Subject: [PATCH 07/26] Added mi350 guards

---
 ...ding_backward_split_indice_weights_template.cu | 15 ++++++++++++++-
 .../backward/embedding_backward_split_template.cu | 10 ++++++++++
 .../forward/embedding_forward_split_template.cu   | 14 ++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
 mode change 100644 => 100755 fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
 mode change 100644 => 100755 fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
old mode 100644
new mode 100755
index 1afb2943bb..8f190d04d2
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
@@ -23,6 +23,10 @@
 #include "fbgemm_gpu/utils/assert_macros.h"
 #include "fbgemm_gpu/utils/kernel_launcher.cuh"
 
+{%- if is_rocm %}
+#include "fbgemm_gpu/rocm/cdna_guard.h"
+{%- endif %}
+
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
 
@@ -359,7 +363,16 @@ Tensor {{ mdesc }}_embedding_codegen_grad_indice_weights{{ vdesc }}_cuda(
     auto aligned_grad_output = aligned_grad_output_tensor_for_cuda_backwards(grad_output);
 
     CUDA_DEVICE_GUARD(dev_weights);
-
+    #ifdef USE_ROCM
+        if (!rocm::is_supported_cdna()) {
+            TORCH_WARN_ONCE("Running on non-CDNA architecture. Performance may be suboptimal.");
+        }
+        else {
+            // Ensure we're running on a supported CDNA architecture (including MI350)
+            TORCH_WARN_ONCE("Running on CDNA architecture");
+        }
+    #endif
+    
     const auto T = D_offsets.size(0) - 1;
     TORCH_CHECK_GT(T, 0);
     // offsets = [B x T  + 1]
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
old mode 100644
new mode 100755
index 6e6796f0ef..a09f7d3886
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -652,6 +652,16 @@ Tensor {{ embedding_cuda_op }}(
 
     CUDA_DEVICE_GUARD(dev_weights);
 
+    #ifdef USE_ROCM
+        if (!rocm::is_supported_cdna()) {
+            TORCH_WARN_ONCE("Running on non-CDNA architecture. Performance may be suboptimal.");
+        }
+        else {
+            // Ensure we're running on a supported CDNA architecture (including MI350)
+            TORCH_WARN_ONCE("Running on CDNA architecture");
+        }
+    #endif
+    
     {%- if nobag and not is_index_select %}
     auto max_D = D;
     {%- endif %}
diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
old mode 100644
new mode 100755
index 6574bda45e..bbd62a8bbc
--- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
@@ -31,6 +31,10 @@
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 {%- endif %}
 
+{%- if is_rocm %}
+#include "fbgemm_gpu/rocm/cdna_guard.h"
+{%- endif %}
+
 {%- if not is_index_select %}
 ////////////////////////////////////////////////////////////////////////////////
 // Required for op registrations
@@ -454,6 +458,16 @@ batch_index_select_dim0_codegen_forward_cuda(
 
     CUDA_DEVICE_GUARD(dev_weights);
 
+    #ifdef USE_ROCM
+        if (!rocm::is_supported_cdna()) {
+            TORCH_WARN_ONCE("Running on non-CDNA architecture. Performance may be suboptimal.");
+        }
+        else {
+            // Ensure we're running on a supported CDNA architecture (including MI350)
+            TORCH_WARN_ONCE("Running on CDNA architecture");
+        }
+    #endif
+    
     {%- if not nobag %}
     int32_t T = D_offsets.numel() - 1;
     {%- else %}

From 1057a22878c5a9ffae0a5594a1ca3b419a2e351a Mon Sep 17 00:00:00 2001
From: shbiswas834 <shreyashri.biswas@amd.com>
Date: Tue, 12 Aug 2025 15:09:39 +0000
Subject: [PATCH 08/26] Fix index overflow in row load

---
 ..._backward_split_device_kernel_template.hip | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
index d5841d6e00..d1a874805a 100644
--- a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
+++ b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
@@ -227,7 +227,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
         table_index = infos[0] >> info_B_num_bits;
         bag_index = infos[0] & info_B_mask;
         {%- endif %}
-        load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+        load_row_per_warp<grad_t, embedding_dim, index_t>::run(
             &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
         {%- if nobag %}
@@ -236,7 +236,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
         table_index = infos[1] >> info_B_num_bits;
         bag_index = infos[1] & info_B_mask;
         {%- endif %}
-        load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+        load_row_per_warp<grad_t, embedding_dim, index_t>::run(
             &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         if constexpr (!weighted){
             #pragma unroll
@@ -250,7 +250,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
                 table_index = infos[j] >> info_B_num_bits;
                 bag_index = infos[j] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
@@ -261,7 +261,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
                 table_index = infos[j + 1] >> info_B_num_bits;
                 bag_index = infos[j + 1] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
             }
 
@@ -290,7 +290,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
                 table_index = infos[j] >> info_B_num_bits;
                 bag_index = infos[j] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
@@ -301,7 +301,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
                 table_index = infos[j + 1] >> info_B_num_bits;
                 bag_index = infos[j + 1] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
             }
 
@@ -328,7 +328,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     table_index = infos[0] >> info_B_num_bits;
     bag_index = infos[0] & info_B_mask;
     {%- endif %}
-    load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+    load_row_per_warp<grad_t, embedding_dim, index_t>::run(
         &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
     {%- if nobag %}
@@ -337,7 +337,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     table_index = infos[1] >> info_B_num_bits;
     bag_index = infos[1] & info_B_mask;
     {%- endif %}
-    load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+    load_row_per_warp<grad_t, embedding_dim, index_t>::run(
         &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
     if constexpr (!weighted) {
@@ -352,7 +352,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             table_index = infos[j] >> info_B_num_bits;
             bag_index = infos[j] & info_B_mask;
             {%- endif %}
-            load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+            load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
@@ -363,7 +363,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             table_index = infos[j + 1] >> info_B_num_bits;
             bag_index = infos[j + 1] & info_B_mask;
             {%- endif %}
-            load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+            load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         }
 
@@ -383,7 +383,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             table_index = infos[j] >> info_B_num_bits;
             bag_index = infos[j] & info_B_mask;
             {%- endif %}
-            load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+            load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
@@ -394,7 +394,7 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             table_index = infos[j + 1] >> info_B_num_bits;
             bag_index = infos[j + 1] & info_B_mask;
             {%- endif %}
-            load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+            load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         }
 
@@ -420,7 +420,7 @@ L_tail_grad_acc:
                 table_index = infos[0] >> info_B_num_bits;
                 bag_index = infos[0] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[0], lane_id);
@@ -441,7 +441,7 @@ L_tail_grad_acc:
                 table_index = infos[0] >> info_B_num_bits;
                 bag_index = infos[0] & info_B_mask;
                 {%- endif %}
-                load_row_per_warp<grad_t, embedding_dim, int32_t>::run(
+                load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[0], lane_id, indice_weights[0]);

From e981269c26ac383ef1932349dfbe19906b9c2ee0 Mon Sep 17 00:00:00 2001
From: shbiswas834 <shreyashri.biswas@amd.com>
Date: Tue, 12 Aug 2025 20:13:09 +0000
Subject: [PATCH 09/26] cta_per_row workgroup reduce by 4 optim

---
 .../training/backward/embedding_backward_split_template.cu    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index a09f7d3886..fd97aea59f 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1054,7 +1054,7 @@ Tensor {{ embedding_cuda_op }}(
 
                     // Compute shared memory size for cta_per_row
                     constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
-                    int32_t num_cta_per_row_groups = (kMaxThreads/2) / kWarpSize;
+                    int32_t num_cta_per_row_groups = (kMaxThreads/4) / kWarpSize;
                     const size_t cta_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
                         &num_cta_per_row_groups,
                         [&] (int num_groups) {
@@ -1065,7 +1065,7 @@ Tensor {{ embedding_cuda_op }}(
                     );
 
                     const int32_t cta_per_row_grid_size = std::min(
-                        div_round_up(total_unique_indices, (kMaxThreads/2)),
+                        div_round_up(total_unique_indices, (kMaxThreads/4)),
                         get_max_thread_blocks_());
 
                     FBGEMM_LAUNCH_KERNEL(

From 3ef8e5605536071cfffc3bb2d13e333c5609b283 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Wed, 13 Aug 2025 13:21:38 +0000
Subject: [PATCH 10/26] Fix mixed_D frontend to backend connection

---
 .../training/backward/embedding_backward_split_template.cu   | 2 +-
 .../pt2/embedding_split_host_pt2_autograd_template.cpp       | 1 +
 .../split_table_batched_embeddings_ops_training.py           | 5 ++++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index fd97aea59f..4732a7a0ec 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1205,7 +1205,7 @@ Tensor {{ embedding_cuda_op }}(
                     const auto supported_weights_type = dev_weights.scalar_type() == at::ScalarType::Half
                                                       || dev_weights.scalar_type() == at::ScalarType::Float;
 
-                    if (use_hip_kernel && supported_weights_type && rocm::is_supported_cdna())
+                    if (use_hip_kernel && !mixed_D && supported_weights_type && rocm::is_supported_cdna())
                     {
                         constexpr int segments_per_workgroup = 4;
                         {%- for kDimSize in [64, 128, 160, 192, 256] %}
diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index 3720f1ea42..20c055e917 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -698,6 +698,7 @@ class {{ autograd_func }} :
     TORCH_CHECK(aux_tensor[IDX_LXU_CACHE_LOCATIONS].has_value(), "lxu_cache_locations should have value.");
     const auto lxu_cache_locations = aux_tensor[IDX_LXU_CACHE_LOCATIONS].value();
     const auto is_experimental = aux_bool[IDX_IS_EXPERIMENTAL_TBE];
+    const auto mixed_D = aux_bool[IDX_MIXED_D];
     {%- endif %}
 
     // Default values for Dynamo tracing
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
index 05e9cdb8c8..0bbb979009 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -808,7 +808,7 @@ def __init__(  # noqa C901
             assert (
                 self.pooling_mode != PoolingMode.NONE
             ), "Mixed dimension tables only supported for pooling tables."
-
+        self.mixed_D = mixed_D
         assert all(
             cd == compute_devices[0] for cd in compute_devices
         ), "Heterogenous compute_devices are NOT supported!"
@@ -2262,6 +2262,7 @@ def forward(  # noqa: C901
                         row_counter,
                         iter_int,
                         self.max_counter.item(),
+                        mixed_D=self.mixed_D,
                     ),
                 )
             elif self._used_rowwise_adagrad_with_global_weight_decay:
@@ -2280,6 +2281,7 @@ def forward(  # noqa: C901
                         #  `Optional[Tensor]` but got `Union[Module, Tensor]`.
                         prev_iter_dev=self.prev_iter_dev,
                         gwd_lower_bound=self.gwd_lower_bound,
+                        mixed_D=self.mixed_D,
                     ),
                 )
             else:
@@ -2289,6 +2291,7 @@ def forward(  # noqa: C901
                         common_args,
                         self.optimizer_args,
                         momentum1,
+                        mixed_D=self.mixed_D,
                     ),
                 )
 

From caf2e9e4bec2f5981d5d54e986ef8c05df14efc9 Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Fri, 15 Aug 2025 15:32:19 +0000
Subject: [PATCH 11/26] changed max_segment_length_per_cta to 4096

---
 .../training/backward/embedding_backward_split_template.cu      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 4732a7a0ec..2371f6a864 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -981,7 +981,7 @@ Tensor {{ embedding_cuda_op }}(
                 auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));
 
                 const bool use_deterministic_algorithms = at::globalContext().deterministicAlgorithms();
-                const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 1024;
+                const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
 
                 Tensor long_run_id_to_really_long_run_ids;
                 if (use_deterministic_algorithms) {

From 4b841bdbf218a353c697eec446493c2317662444 Mon Sep 17 00:00:00 2001
From: shbiswas834 <shreyashri.biswas@amd.com>
Date: Mon, 18 Aug 2025 22:32:58 +0000
Subject: [PATCH 12/26] added rocm guards and removed comment

---
 .../embedding_backward_split_template.cu      | 19 ++++++++++++++++---
 fbgemm_gpu/src/tbe/eeg/indices_generator.cpp  |  1 -
 2 files changed, 16 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 fbgemm_gpu/src/tbe/eeg/indices_generator.cpp

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 2371f6a864..1446a69deb 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -981,7 +981,11 @@ Tensor {{ embedding_cuda_op }}(
                 auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));
 
                 const bool use_deterministic_algorithms = at::globalContext().deterministicAlgorithms();
-                const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
+                #ifdef USE_ROCM
+                    const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
+                #else
+                    const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 1024;
+                #endif
 
                 Tensor long_run_id_to_really_long_run_ids;
                 if (use_deterministic_algorithms) {
@@ -1054,7 +1058,11 @@ Tensor {{ embedding_cuda_op }}(
 
                     // Compute shared memory size for cta_per_row
                     constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
-                    int32_t num_cta_per_row_groups = (kMaxThreads/4) / kWarpSize;
+                    #ifdef USE_ROCM
+                        int32_t num_cta_per_row_groups = (kMaxThreads/4) / kWarpSize;
+                    #else
+                        int32_t num_cta_per_row_groups = kMaxThreads / kWarpSize;
+                    #endif
                     const size_t cta_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
                         &num_cta_per_row_groups,
                         [&] (int num_groups) {
@@ -1065,7 +1073,12 @@ Tensor {{ embedding_cuda_op }}(
                     );
 
                     const int32_t cta_per_row_grid_size = std::min(
-                        div_round_up(total_unique_indices, (kMaxThreads/4)),
+                        #ifdef USE_ROCM
+                            div_round_up(total_unique_indices, (kMaxThreads/4)),
+                        #else
+                            div_round_up(total_unique_indices, kMaxThreads),
+                        #endif
+                        
                         get_max_thread_blocks_());
 
                     FBGEMM_LAUNCH_KERNEL(
diff --git a/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp b/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
old mode 100644
new mode 100755
index 74b798d731..6a3b20c4d4
--- a/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
+++ b/fbgemm_gpu/src/tbe/eeg/indices_generator.cpp
@@ -132,7 +132,6 @@ torch::Tensor IndicesGenerator::generate() {
   // Now sort the indices by their tags. Use parallel sort for some extra speed
   // (vector is very large).
   std::sort(
-  //    std::execution::par,
       std::begin(indicesWithTags),
       std::end(indicesWithTags),
       [](const std::pair<int64_t, double>& lhs,

From 63287f645c2a91f87e570b3141e1219b2f2ec140 Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Wed, 20 Aug 2025 03:00:56 +0000
Subject: [PATCH 13/26] clean debug statements in Hip.cmake

---
 fbgemm_gpu/cmake/Hip.cmake | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake
index 2011a34c33..17640b7254 100644
--- a/fbgemm_gpu/cmake/Hip.cmake
+++ b/fbgemm_gpu/cmake/Hip.cmake
@@ -78,14 +78,6 @@ if(HIP_FOUND)
   list(APPEND HIP_CXX_FLAGS -mf16c)
   list(APPEND HIP_CXX_FLAGS -mfma)
   list(APPEND HIP_CXX_FLAGS -std=c++20)
-  list(APPEND HIP_CXX_FLAGS -g)
-  list(APPEND HIP_CXX_FLAGS -ggdb)
-
- # list(APPEND HIP_CXX_FLAGS -Wa,-adhln)
-  #list(APPEND HIP_CXX_FLAGS -adhln)
-  list(APPEND HIP_CXX_FLAGS -save-temps)
-  list(APPEND HIP_CXX_FLAGS -fverbose-asm)
-
 
   set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
   # Ask hcc to generate device code during compilation so we can use

From b6d76f9aa116f0bbcedbf3f61be24e2b6e33d462 Mon Sep 17 00:00:00 2001
From: Shreya <shbiswas@amd.com>
Date: Thu, 28 Aug 2025 11:43:32 -0500
Subject: [PATCH 14/26] Merge pull request #121

warp per row wg change
---
 .../embedding_backward_split_template.cu      | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 1446a69deb..f4dccde983 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1058,10 +1058,21 @@ Tensor {{ embedding_cuda_op }}(
 
                     // Compute shared memory size for cta_per_row
                     constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
+                    int32_t total_L = indices.numel();
                     #ifdef USE_ROCM
-                        int32_t num_cta_per_row_groups = (kMaxThreads/4) / kWarpSize;
+                        int32_t num_cta_per_row_groups;
+                        int32_t work_group_size;
+                        if (total_L/total_B > 1){
+                            num_cta_per_row_groups = (kMaxThreads/4) / kWarpSize;
+                            work_group_size = (kMaxThreads/4);
+                        }
+                        else{
+                            num_cta_per_row_groups = kMaxThreads / kWarpSize;
+                            work_group_size = kMaxThreads;
+                        }
                     #else
                         int32_t num_cta_per_row_groups = kMaxThreads / kWarpSize;
+                        int32_t work_group_size = kMaxThreads;
                     #endif
                     const size_t cta_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
                         &num_cta_per_row_groups,
@@ -1073,17 +1084,13 @@ Tensor {{ embedding_cuda_op }}(
                     );
 
                     const int32_t cta_per_row_grid_size = std::min(
-                        #ifdef USE_ROCM
-                            div_round_up(total_unique_indices, (kMaxThreads/4)),
-                        #else
-                            div_round_up(total_unique_indices, kMaxThreads),
-                        #endif
-                        
+                        div_round_up(total_unique_indices, work_group_size),
                         get_max_thread_blocks_());
 
                     FBGEMM_LAUNCH_KERNEL(
                         backward_cta_per_row_kernel,
                         cta_per_row_grid_size,
+                        // (64, 2)
                         dim3(kThreadGroupSize, num_cta_per_row_groups),
                         cta_per_row_smem_bytes,
                         at::cuda::getCurrentCUDAStream(),
@@ -1187,7 +1194,18 @@ Tensor {{ embedding_cuda_op }}(
                              kUseVecBlocking>;
 
                     // Compute shared memory size for warp_per_row
-                    int32_t num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
+                    #ifdef USE_ROCM
+                        int32_t num_warp_per_row_groups;
+                        
+                        if (total_L/total_B > 1){
+                            num_warp_per_row_groups = (kBackwardMaxThreads/2) / kThreadGroupSize;
+                        }
+                        else{
+                            num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
+                        }
+                    #else
+                        int32_t num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
+                    #endif
                     int32_t warp_per_row_smem_bytes = 0;
 
                     if constexpr (kUseVecBlocking) {

From 76d091408a090aa94538db82fa7840ac26d32549 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 2 Sep 2025 09:25:03 +0000
Subject: [PATCH 15/26] Guard f16 llvm intrinsics with ROCm >=7.0

---
 fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index 974eae2594..46c4603381 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -24,6 +24,7 @@
 #include <c10/util/Half.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
+#include <rocm-core/rocm_version.h>
 
 /******************************************************************************/
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
@@ -61,7 +62,7 @@ __device__ half llvm_amdgcn_raw_buffer_load_fp16(
     int32_t voffset,
     int32_t soffset,
     int32_t glc_slc)
-#if defined(__gfx950__)
+#if ROCM_VERSION_MAJOR >= 7
       __asm("llvm.amdgcn.raw.buffer.load.i16");
 #else
       __asm("llvm.amdgcn.raw.buffer.load.f16");
@@ -78,7 +79,7 @@ __device__ half2 llvm_amdgcn_raw_buffer_load_fp16x2(
     int32_t voffset,
     int32_t soffset,
     int32_t glc_slc)
-#if defined(__gfx950__)
+#if ROCM_VERSION_MAJOR >= 7
       __asm("llvm.amdgcn.raw.buffer.load.i32");
 #else
       __asm("llvm.amdgcn.raw.buffer.load.v2f16");

From a56c29907e87a8e03d24d47b578273a42dd609af Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Thu, 18 Sep 2025 16:28:31 +0000
Subject: [PATCH 16/26] fix the bug in dimention 160 in ROCm optimization

---
 fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index 46c4603381..8a97579d6a 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -165,7 +165,7 @@ struct load_row_per_warp<half, 160, index_t> {
   static __device__ void
   run(half* emb_data, index_t row_index, const half* p_emb_table, int lane_id) {
     int32x4_t emb_res =
-        amdgcn_make_buffer_resource(p_emb_table + row_index * 192);
+        amdgcn_make_buffer_resource(p_emb_table + row_index * 160);
     *reinterpret_cast<half2*>(emb_data) = llvm_amdgcn_raw_buffer_load_fp16x2(
         emb_res, lane_id * sizeof(half2), 0, 0);
     if ((lane_id + 128) % 192 < 160) {

From 54f296d8445d8465ce675dcb0bbee726500629b0 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Tue, 19 Aug 2025 13:41:17 +0000
Subject: [PATCH 17/26] Cleanup optimized warp_per_raw kernel

---
 fbgemm_gpu/cmake/tbe_sources.py               |  2 -
 .../genscript/generate_backward_split.py      | 10 +-
 ...ing_backward_split_kernel_warp_template.cu | 40 +++-----
 .../embedding_backward_split_template.cu      | 18 ++--
 ..._backward_split_device_kernel_template.hip | 94 +++++--------------
 5 files changed, 54 insertions(+), 110 deletions(-)

diff --git a/fbgemm_gpu/cmake/tbe_sources.py b/fbgemm_gpu/cmake/tbe_sources.py
index 31200b6190..dc3acace35 100644
--- a/fbgemm_gpu/cmake/tbe_sources.py
+++ b/fbgemm_gpu/cmake/tbe_sources.py
@@ -176,7 +176,6 @@
             "_nobag" if nobag else "",
         )
         for nobag in [
-            True,
             False,
         ]
         for weighted in (
@@ -495,7 +494,6 @@
         "_nobag" if nobag else "",
     )
     for nobag in [
-        True,
         False,
     ]
     for weighted in (
diff --git a/fbgemm_gpu/codegen/genscript/generate_backward_split.py b/fbgemm_gpu/codegen/genscript/generate_backward_split.py
index a817232910..5acb6f2e7f 100644
--- a/fbgemm_gpu/codegen/genscript/generate_backward_split.py
+++ b/fbgemm_gpu/codegen/genscript/generate_backward_split.py
@@ -52,7 +52,11 @@ def render_backward_templates(
             return
 
         weighted_options = [True, False]
-        nobag_options = [True, False] if (not is_gwd) else [False]
+        nobag_options = (
+            [True, False]
+            if (not (is_gwd or kwargs.get("is_hip_optimized_backward")))
+            else [False]
+        )
         vbe_options = [True, False] if (kwargs.get("has_vbe_support")) else [False]
         ssd_options = [True, False] if kwargs.get("has_ssd_support") else [False]
         template = CodeTemplate.load(template_filepath)
@@ -327,8 +331,7 @@ def generate_backward_indices() -> None:
 
     @staticmethod
     def generate_rocm_backward_split(**kwargs: Any) -> None:
-        # Generate backward device kernels based on weighted (True/False), VBE
-        # (True/False), no bag (True/False)
+        # Generate backward device kernels based on weighted (True/False)
         template_filepath = (
             "training/backward/rocm/embedding_backward_split_device_kernel_template.hip"
         )
@@ -343,6 +346,7 @@ def generate_rocm_backward_split(**kwargs: Any) -> None:
                 "has_ssd_support": False,
                 "dense": False,
                 "gen_once": False,
+                "is_hip_optimized_backward": True,
             },
         )
 
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
index 322d997e83..45735ff1a9 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
@@ -32,6 +32,14 @@
 
 {%- set locs_or_addrs_tensor = "ssd_row_addrs" if ssd else "lxu_cache_locations" %}
 {%- set locs_or_addrs_type = "int64_t" if ssd else "int32_t" %}
+{%- set is_optimized_hip_kernel_supported_mode = is_rocm and 
+                                                 optimizer == "rowwise_adagrad" and 
+                                                 not dense and
+                                                 not nobag and 
+                                                 not is_index_select and
+                                                 not is_gwd_kernel and 
+                                                 not vbe and 
+                                                 not ssd %}
 
 #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
 #include "fbgemm_gpu/utils/tensor_accessor_builder.h"
@@ -538,7 +546,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row
 
 {%- endif %}
 
-{%- if is_rocm and not is_index_select and optimizer == "rowwise_adagrad" and not dense and not is_gwd_kernel and not vbe and not ssd %}
+{%- if is_optimized_hip_kernel_supported_mode %}
 #include <hip/hip_runtime.h>
 #include <hip/hip_fp16.h>
 #include "fbgemm_gpu/rocm/split_embeddings_common.h"
@@ -612,12 +620,8 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     {{ args.split_kernel_args | replace_pta_namespace() | join(",\n    ") }}
     {%- endif %}
 ) {
-    {%- if not nobag %}
     int32_t T = D_offsets.size(0) - 1;
-    {%- else %}
-    int32_t T = weights_offsets.size(0);
-    {%- endif %}
-
+    
     auto p_output_grad = grad_output.data();
     auto p_emb_table = dev_weights.data();
     auto p_hash_size_cumsum = hash_size_cumsum.data();
@@ -632,8 +636,6 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     constexpr int32_t segment_prefetch = 2;
     constexpr int32_t segment_unroll = 8;
     constexpr int32_t segment_split = 0;
-    auto batch = grad_output.size(0);
-    auto num_rows = dev_weights.size(0) / T / max_D;
     {%- if weighted %}
     constexpr bool is_weighted = true;
     {%- else %}
@@ -646,22 +648,7 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     // weight_decay(_mode) is supplied as args.split_function_args_no_defaults
     opt_karg.weight_decay_mode = weight_decay_mode_v;
     opt_karg.weight_decay = weight_decay;
-    auto batch_mdiv = [](uint32_t d) -> rocm::magic_div_u32_t {
-        assert(d >= 1 && d <= INT32_MAX);
-        uint8_t shift;
-        for(shift = 0; shift < 32; shift++)
-            if((1U << shift) >= d)
-                break;
-
-        uint64_t one   = 1;
-        uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
-        assert(magic <= 0xffffffffUL);
-
-        rocm::magic_div_u32_t result;
-        result.magic = magic;
-        result.shift = shift;
-        return result;
-    }(batch);
+
     rocm::split_tbe_backward_hip_kernel_{{kdesc}}<
         rocm::{{optimizer}}_optimizer_t<cache_t, emb_t, embedding_dim, weight_decay_mode_v>,
         rocm::{{optimizer}}_kernel_arg_t,
@@ -680,16 +667,11 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
                p_sorted_linear_indices_run,
                p_sorted_linear_indices_cumulative_run_lengths,
                p_sorted_linear_indices_num_runs,
-               {%- if not nobag %}
                info_B_num_bits,
                info_B_mask,
-               {%- endif %}
                p_sorted_infos,
-               batch_mdiv,
                max_segment_length_per_warp,
                emb_dim,
-               batch,
-               num_rows,
                T,
                opt_karg
                {%- if weighted %}
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index f4dccde983..64b6995e55 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -48,6 +48,15 @@ using namespace fbgemm_gpu;
     has_global_weight_decay_support,
     ssd) %}
 {%- set desc_suffix = get_desc_suffix(is_gwd_kernel) %}
+{%- set is_optimized_hip_kernel_supported_mode = is_rocm and 
+                                                 optimizer == "rowwise_adagrad" and 
+                                                 not dense and
+                                                 not nobag and 
+                                                 not is_index_select and
+                                                 not is_gwd_kernel and 
+                                                 not vbe and 
+                                                 not ssd %}
+
 template <
     typename emb_t,
     typename grad_t,
@@ -227,8 +236,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     {%- endif %}
 );
 
-{%- if is_rocm and optimizer == "rowwise_adagrad" and not dense and not is_index_select
-               and not is_gwd_kernel and not vbe and not ssd %}
+{%- if is_optimized_hip_kernel_supported_mode %}
 #include "fbgemm_gpu/rocm/split_embeddings_common.h"
 template <
     typename emb_t,
@@ -862,8 +870,7 @@ Tensor {{ embedding_cuda_op }}(
     }
     {%- endif %}
 
-    {%- if is_rocm and optimizer == "rowwise_adagrad" and not dense and not is_index_select
-                   and not is_gwd_kernel and not vbe and not ssd %}
+    {%- if is_optimized_hip_kernel_supported_mode %}
     {%- set hip_kernel = "hip_split_embedding{}_backward_codegen_{}_{}{}_kernel_warp_per_row_1".format(
             ndesc,
             optimizer,
@@ -1228,8 +1235,7 @@ Tensor {{ embedding_cuda_op }}(
                         get_max_thread_blocks_());
 
 #ifdef USE_ROCM
-                    {%- if is_rocm and not is_index_select and optimizer == "rowwise_adagrad" and
-                        not dense and not is_gwd_kernel and not vbe and not ssd and not nobag %}
+                    {%- if is_optimized_hip_kernel_supported_mode %}
 
                     const static auto use_hip_kernel = fbgemm_gpu::config::is_feature_enabled(fbgemm_gpu::config::FeatureGateName::TBE_ROCM_HIP_BACKWARD_KERNEL);
 
diff --git a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
index d1a874805a..951cff4399 100644
--- a/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
+++ b/fbgemm_gpu/codegen/training/backward/rocm/embedding_backward_split_device_kernel_template.hip
@@ -122,20 +122,11 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     const index_t* p_sorted_linear_indices_run,
     const int32_t* p_sorted_linear_indices_cumulative_run_lengths,
     const int32_t* p_sorted_linear_indices_num_runs,
-    {%- if not nobag %}
     const int32_t info_B_num_bits,
     const uint32_t info_B_mask,
-    {%- endif %}
-    {%- if not nobag %}
     const int32_t* p_sorted_infos,
-    {%- else %}
-    const int64_t* p_sorted_infos,
-    {%- endif %}
-    magic_div_u32_t batch_mdiv,
     uint32_t max_segment_length_per_warp,
     uint32_t emb_dim,
-    uint32_t batch,
-    uint32_t num_rows,
     uint32_t num_tables,
     optimizer_karg_t opt_karg,
     const float * p_sorted_indice_weights = nullptr)
@@ -157,13 +148,9 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     const int32_t segment_start = p_sorted_linear_indices_cumulative_run_lengths[run_id];
     const int32_t segment_end   = p_sorted_linear_indices_cumulative_run_lengths[run_id + 1];
 
-    {%- if nobag %}
-    const auto info_0 = p_sorted_infos[segment_start];
-    int32_t t_0 = info_0 % num_tables;
-    {%- else %}
     const auto info_0 = reinterpret_cast<const uint32_t*>(&p_sorted_infos[0])[segment_start];
     const auto t_0 = info_0 >> info_B_num_bits;
-    {%- endif %}
+
     int64_t hash_size = p_hash_size_cumsum[t_0];
 
     const int64_t emb_idx = linear_index - hash_size;
@@ -221,21 +208,15 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     // LOOP
     for(; itr < segment_length_mod; itr += segment_unroll)
     {
-        {%- if nobag %}
-        magic_div_u32_run_with_mod(batch_mdiv, infos[0], batch, table_index, bag_index);
-        {%- else %}
         table_index = infos[0] >> info_B_num_bits;
         bag_index = infos[0] & info_B_mask;
-        {%- endif %}
+
         load_row_per_warp<grad_t, embedding_dim, index_t>::run(
             &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
-        {%- if nobag %}
-        magic_div_u32_run_with_mod(batch_mdiv, infos[1], batch, table_index, bag_index);
-        {%- else %}
         table_index = infos[1] >> info_B_num_bits;
         bag_index = infos[1] & info_B_mask;
-        {%- endif %}
+
         load_row_per_warp<grad_t, embedding_dim, index_t>::run(
             &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         if constexpr (!weighted){
@@ -244,23 +225,19 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             {
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[0], lane_id);
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[j], batch, table_index, bag_index);
-                {%- else %}
+                
                 table_index = infos[j] >> info_B_num_bits;
                 bag_index = infos[j] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[dword_per_row], lane_id);
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[j + 1], batch, table_index, bag_index);
-                {%- else %}
+                
                 table_index = infos[j + 1] >> info_B_num_bits;
                 bag_index = infos[j + 1] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
             }
@@ -284,23 +261,19 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
             {
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[0], lane_id, indice_weights[j-2]);
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[j], batch, table_index, bag_index);
-                {%- else %}
+                
                 table_index = infos[j] >> info_B_num_bits;
                 bag_index = infos[j] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                     &grad_acc[0], &grad_data[dword_per_row], lane_id, indice_weights[j-1]);
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[j + 1], batch, table_index, bag_index);
-                {%- else %}
+                
                 table_index = infos[j + 1] >> info_B_num_bits;
                 bag_index = infos[j + 1] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
             }
@@ -322,21 +295,16 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
     }
 
     // LAST
-    {%- if nobag %}
-    magic_div_u32_run_with_mod(batch_mdiv, infos[0], batch, table_index, bag_index);
-    {%- else %}
     table_index = infos[0] >> info_B_num_bits;
     bag_index = infos[0] & info_B_mask;
-    {%- endif %}
+
     load_row_per_warp<grad_t, embedding_dim, index_t>::run(
         &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
-    {%- if nobag %}
-    magic_div_u32_run_with_mod(batch_mdiv, infos[1], batch, table_index, bag_index);
-    {%- else %}
+    
     table_index = infos[1] >> info_B_num_bits;
     bag_index = infos[1] & info_B_mask;
-    {%- endif %}
+
     load_row_per_warp<grad_t, embedding_dim, index_t>::run(
         &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
@@ -346,23 +314,19 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
         {
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                 &grad_acc[0], &grad_data[0], lane_id);
-            {%- if nobag %}
-            magic_div_u32_run_with_mod(batch_mdiv, infos[j], batch, table_index, bag_index);
-            {%- else %}
+            
             table_index = infos[j] >> info_B_num_bits;
             bag_index = infos[j] & info_B_mask;
-            {%- endif %}
+
             load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                 &grad_acc[0], &grad_data[dword_per_row], lane_id);
-            {%- if nobag %}
-            magic_div_u32_run_with_mod(batch_mdiv, infos[j + 1], batch, table_index, bag_index);
-            {%- else %}
+            
             table_index = infos[j + 1] >> info_B_num_bits;
             bag_index = infos[j + 1] & info_B_mask;
-            {%- endif %}
+
             load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         }
@@ -377,23 +341,19 @@ __device__ void split_tbe_backward_hip_kernel_{{ kdesc }}(
         {
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                 &grad_acc[0], &grad_data[0], lane_id, indice_weights[j-2]);
-            {%- if nobag %}
-            magic_div_u32_run_with_mod(batch_mdiv, infos[j], batch, table_index, bag_index);
-            {%- else %}
+            
             table_index = infos[j] >> info_B_num_bits;
             bag_index = infos[j] & info_B_mask;
-            {%- endif %}
+
             load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
 
             accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
                 &grad_acc[0], &grad_data[dword_per_row], lane_id, indice_weights[j-1]);
-            {%- if nobag %}
-            magic_div_u32_run_with_mod(batch_mdiv, infos[j + 1], batch, table_index, bag_index);
-            {%- else %}
+            
             table_index = infos[j + 1] >> info_B_num_bits;
             bag_index = infos[j + 1] & info_B_mask;
-            {%- endif %}
+
             load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                 &grad_data[dword_per_row], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
         }
@@ -414,12 +374,9 @@ L_tail_grad_acc:
                 infos[0] = p_sorted_infos[segment_start];
                 p_sorted_infos++;
 
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[0], batch, table_index, bag_index);
-                {%- else %}
                 table_index = infos[0] >> info_B_num_bits;
                 bag_index = infos[0] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(
@@ -435,12 +392,9 @@ L_tail_grad_acc:
                 p_sorted_infos++;
                 p_sorted_indice_weights++;
 
-                {%- if nobag %}
-                magic_div_u32_run_with_mod(batch_mdiv, infos[0], batch, table_index, bag_index);
-                {%- else %}
                 table_index = infos[0] >> info_B_num_bits;
                 bag_index = infos[0] & info_B_mask;
-                {%- endif %}
+
                 load_row_per_warp<grad_t, embedding_dim, index_t>::run(
                     &grad_data[0], bag_index * num_tables, p_output_grad + table_index * embedding_dim, lane_id);
                 accumulate_row_per_warp<grad_t, embedding_dim, cache_t, weighted>::run(

From b3b9868e167f060bab634cfc1147d5e0b37ed154 Mon Sep 17 00:00:00 2001
From: Andrey Bokovoy <avbokovoy@gmail.com>
Date: Wed, 20 Aug 2025 12:15:37 +0000
Subject: [PATCH 18/26] Add 320 embedding dim support for optimized
 warp_per_row kernel

---
 ...ing_backward_split_kernel_warp_template.cu |  2 +-
 .../embedding_backward_split_template.cu      |  2 +-
 .../fbgemm_gpu/rocm/split_embeddings_common.h | 26 +++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
index 45735ff1a9..f85ab016a0 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
@@ -766,7 +766,7 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     {%- for emb_type in (['float', 'at::Half'] + (['at::Float8_e4m3fnuz'] if is_rocm else ['at::Float8_e4m3fn'])) %}
     {%- for cache_type in ['float', 'at::Half'] %}
     {%- for index_type in ['int32_t', 'int64_t'] %}
-    {%- for kEmbeddingDim in [64, 128, 160, 192, 256] %}
+    {%- for kEmbeddingDim in [64, 128, 160, 192, 256, 320] %}
     {%- for kWeighDecayMode in [0, 1, 2] %}
         {{ hip_template_instantiation(
             emb_type,
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 64b6995e55..d792a1d4bd 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1245,7 +1245,7 @@ Tensor {{ embedding_cuda_op }}(
                     if (use_hip_kernel && !mixed_D && supported_weights_type && rocm::is_supported_cdna())
                     {
                         constexpr int segments_per_workgroup = 4;
-                        {%- for kDimSize in [64, 128, 160, 192, 256] %}
+                        {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
                         {%- for kWeightDecayMode in [0, 1, 2] %}
                         if (max_D == {{ kDimSize }} && weight_decay_mode == {{ kWeightDecayMode }})
                         {
diff --git a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
index 8a97579d6a..5b9d69d910 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/rocm/split_embeddings_common.h
@@ -205,6 +205,22 @@ struct load_row_per_warp<half, 256, index_t> {
   }
 };
 
+template <typename index_t>
+struct load_row_per_warp<half, 320, index_t> {
+  static __device__ void
+  run(half* emb_data, index_t row_index, const half* p_emb_table, int lane_id) {
+    int32x4_t emb_res =
+        amdgcn_make_buffer_resource(p_emb_table + row_index * 320);
+    *reinterpret_cast<half2*>(&emb_data[0]) =
+        llvm_amdgcn_raw_buffer_load_fp16x2(
+            emb_res, lane_id * sizeof(half2), 0, 0);
+    *reinterpret_cast<half2*>(&emb_data[2]) =
+        llvm_amdgcn_raw_buffer_load_fp16x2(
+            emb_res, (lane_id + 64) * sizeof(half2), 0, 0);
+     emb_data[4] = p_emb_table[row_index * 320 + 256 + lane_id]; 
+  }
+};
+
 template <typename index_t>
 struct load_row_per_warp<half, 512, index_t> {
   static __device__ void
@@ -304,6 +320,16 @@ struct store_row_per_warp<c10::Half, 256, c10::Half> {
   }
 };
 
+template <>
+struct store_row_per_warp<c10::Half, 320, c10::Half> {
+  static __device__ void run(c10::Half* acc, c10::Half* p_output, int lane_id) {
+    auto out = reinterpret_cast<half2*>(p_output);
+    out[lane_id] = *reinterpret_cast<half2*>(acc);
+    out[lane_id + 64] = *reinterpret_cast<half2*>(&acc[2]);
+    p_output[lane_id + 256] = acc[4]; 
+  }
+};
+
 
 template <>
 struct store_row_per_warp<half, 128, float> {

From 8ae472447bd6a01703d3fd9481c7a0ece5459e09 Mon Sep 17 00:00:00 2001
From: root <root@asrock-126-015b.aus.dcgpu>
Date: Mon, 8 Sep 2025 19:34:16 +0000
Subject: [PATCH 19/26] changed the max length per warp and cta per row WG size

---
 .../backward/embedding_backward_split_host_template.cpp     | 2 +-
 .../training/backward/embedding_backward_split_template.cu  | 6 +-----
 .../training/index_select/batch_index_select_dim0_host.cpp  | 2 +-
 .../pt2/embedding_split_host_pt2_autograd_template.cpp      | 2 +-
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
index e071d88768..6d3769534e 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
@@ -949,7 +949,7 @@ class {{ autograd_func }} :
 
 #ifdef USE_ROCM
     constexpr int32_t BT_block_size = 64;
-    constexpr int32_t max_segment_length_per_warp = 64;
+    constexpr int32_t max_segment_length_per_warp =  4096;
 #else
     constexpr int32_t BT_block_size = 32;
     constexpr int32_t max_segment_length_per_warp = 32;
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index d792a1d4bd..77f807233f 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -988,11 +988,7 @@ Tensor {{ embedding_cuda_op }}(
                 auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));
 
                 const bool use_deterministic_algorithms = at::globalContext().deterministicAlgorithms();
-                #ifdef USE_ROCM
-                    const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
-                #else
-                    const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 1024;
-                #endif
+                const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
 
                 Tensor long_run_id_to_really_long_run_ids;
                 if (use_deterministic_algorithms) {
diff --git a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
index 06cd53b16b..fd43cc18a6 100644
--- a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
+++ b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
@@ -656,7 +656,7 @@ class BatchIndexSelectDim0TensorGPUOp
     const auto permute_output_dim_0_1 =
         ctx->saved_data["permute_output_dim_0_1"].toBool();
 
-    constexpr int32_t max_segment_length_per_warp = 32;
+    constexpr int32_t max_segment_length_per_warp =  4096;
 
     auto grad_output = grad_outputs[0];
 
diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index 20c055e917..46384be1bb 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1006,7 +1006,7 @@ static torch::autograd::variable_list backward(
 
 #ifdef USE_ROCM
     constexpr int32_t BT_block_size = 64;
-    constexpr int32_t max_segment_length_per_warp = 64;
+    constexpr int32_t max_segment_length_per_warp =  4096;
 #else
     constexpr int32_t BT_block_size = 32;
     constexpr int32_t max_segment_length_per_warp = 32;

From a5f48da65ca79c1b88850428223bd986a507360b Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Tue, 9 Sep 2025 20:25:30 +0000
Subject: [PATCH 20/26] added DPP and changed max length per warp to 16k

---
 .../embedding_backward_split_host_template.cpp     |  2 +-
 .../index_select/batch_index_select_dim0_host.cpp  |  4 ++--
 .../embedding_split_host_pt2_autograd_template.cpp |  2 +-
 .../include/fbgemm_gpu/utils/cuda_prelude.cuh      | 14 ++++++++------
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
index 6d3769534e..05b93d9d7e 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp
@@ -949,7 +949,7 @@ class {{ autograd_func }} :
 
 #ifdef USE_ROCM
     constexpr int32_t BT_block_size = 64;
-    constexpr int32_t max_segment_length_per_warp =  4096;
+    constexpr int32_t max_segment_length_per_warp =  16384;
 #else
     constexpr int32_t BT_block_size = 32;
     constexpr int32_t max_segment_length_per_warp = 32;
diff --git a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
index fd43cc18a6..37c5ce7cc0 100644
--- a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
+++ b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp
@@ -341,7 +341,7 @@ class BatchIndexSelectDim0GPUOp
     Tensor grad_dev_weights;
     TORCH_CHECK_EQ(grad_outputs.size(), 1);
 
-    constexpr int32_t max_segment_length_per_warp = 32;
+    constexpr int32_t max_segment_length_per_warp = 16384;
 
     auto grad_output = grad_outputs[0];
 
@@ -656,7 +656,7 @@ class BatchIndexSelectDim0TensorGPUOp
     const auto permute_output_dim_0_1 =
         ctx->saved_data["permute_output_dim_0_1"].toBool();
 
-    constexpr int32_t max_segment_length_per_warp =  4096;
+    constexpr int32_t max_segment_length_per_warp =  16384;
 
     auto grad_output = grad_outputs[0];
 
diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index 46384be1bb..8fb2cdf2ed 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1006,7 +1006,7 @@ static torch::autograd::variable_list backward(
 
 #ifdef USE_ROCM
     constexpr int32_t BT_block_size = 64;
-    constexpr int32_t max_segment_length_per_warp =  4096;
+    constexpr int32_t max_segment_length_per_warp =  16384;
 #else
     constexpr int32_t BT_block_size = 32;
     constexpr int32_t max_segment_length_per_warp = 32;
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
index 0d65c4798a..a1d9819017 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
@@ -21,7 +21,9 @@
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 #endif
 #include <cassert>
-
+#ifdef USE_ROCM
+#include "fbgemm_gpu/rocm/split_embeddings_common.h"
+#endif
 namespace {
 
 inline int get_device_sm_cnt_() {
@@ -138,11 +140,11 @@ template <typename T, int ReduceWidth = kWarpSize>
 DEVICE_INLINE T warpReduceAllSum(
     T val,
     unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) {
-#pragma unroll
-  for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) {
-    val += shfl_xor(val, mask, ReduceWidth, shfl_sync_mask);
-  }
-  return val;
+      return rocm::wave_reduce<
+        rocm::reduce_op::sum,  // Sum reduction
+        T,                                 // Data type
+        ReduceWidth                        // Wave/Warp size
+    >(val);
 }
 
 DEVICE_INLINE void syncwarp() {

From a4cceb719acf72d2761c0951076b99e68a215d71 Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Wed, 10 Sep 2025 19:33:44 +0000
Subject: [PATCH 21/26] guard max segment warp based on emb dim

---
 ...dding_split_host_pt2_autograd_template.cpp | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index 8fb2cdf2ed..c587ccb83a 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1006,7 +1006,25 @@ static torch::autograd::variable_list backward(
 
 #ifdef USE_ROCM
     constexpr int32_t BT_block_size = 64;
-    constexpr int32_t max_segment_length_per_warp =  16384;
+    int32_t max_segment_length_per_warp = 64;
+    // Workaround. Should not be upstreamed in any way.
+    // Redistribute all cta_per_row work to warp_per_row.
+    {%- if (not nobag) and 
+           (optimizer == "rowwise_adagrad") and 
+           (not vbe) and 
+           (not is_gwd) and 
+           (not ssd) and 
+           (not is_index_select) and 
+           (not dense) %}
+    const auto T = weights_offsets.sym_numel();
+    const auto B = (offsets.size(0) - 1) / T;
+    {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
+    if(!mixed_D && (max_D == {{ kDimSize }})) 
+    {
+      max_segment_length_per_warp = 16384;
+    }
+    {%- endfor %}
+    {%- endif %}
 #else
     constexpr int32_t BT_block_size = 32;
     constexpr int32_t max_segment_length_per_warp = 32;

From 8b4f25cda1425526a5f070a0741fb42aa763cc85 Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Wed, 10 Sep 2025 22:00:20 +0000
Subject: [PATCH 22/26] added guarding opt of max segment for the case batch
 size list=1

---
 .../pt2/embedding_split_host_pt2_autograd_template.cpp      | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index c587ccb83a..fa6a27ab55 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1009,6 +1009,7 @@ static torch::autograd::variable_list backward(
     int32_t max_segment_length_per_warp = 64;
     // Workaround. Should not be upstreamed in any way.
     // Redistribute all cta_per_row work to warp_per_row.
+    int32_t total_L = indices.numel();
     {%- if (not nobag) and 
            (optimizer == "rowwise_adagrad") and 
            (not vbe) and 
@@ -1017,9 +1018,10 @@ static torch::autograd::variable_list backward(
            (not is_index_select) and 
            (not dense) %}
     const auto T = weights_offsets.sym_numel();
-    const auto B = (offsets.size(0) - 1) / T;
+    auto total_B = (offsets.size(0) - 1);
+    const auto B = total_B / T;
     {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
-    if(!mixed_D && (max_D == {{ kDimSize }})) 
+    if(!mixed_D && total_L / total_B > 1 && (max_D == {{ kDimSize }})) 
     {
       max_segment_length_per_warp = 16384;
     }

From bf4769b749e07365b71704c6f1d2d33e6d4ce2c7 Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Tue, 16 Sep 2025 17:51:25 +0000
Subject: [PATCH 23/26] added condition to apply DPP warp reduce sum when emb
 matches backward opt size and adjusted WG size for L=1 on hip split embed
 kernel

---
 ..._backward_split_indice_weights_template.cu |  7 +++---
 .../embedding_backward_split_template.cu      |  2 +-
 ...dding_split_host_pt2_autograd_template.cpp |  5 +++--
 .../include/fbgemm_gpu/utils/cuda_prelude.cuh | 22 ++++++++++++++-----
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
index 8f190d04d2..95950b8b1e 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
@@ -420,13 +420,14 @@ Tensor {{ mdesc }}_embedding_codegen_grad_indice_weights{{ vdesc }}_cuda(
                     "{}_embedding_codegen_grad_indice_weights{}_{}kernel".format(
                         mdesc, vdesc, vbdesc)
                 %}
-                FBGEMM_LAUNCH_KERNEL(
-                    ({{ kernel_name }}<
+                auto kernel_name_ = {{ kernel_name }}<
                         emb_t,
                         grad_t,
                         cache_t,
                         index_t,
-                        kFixedMaxVecsPerThread>),
+                        kFixedMaxVecsPerThread>;
+                FBGEMM_LAUNCH_KERNEL(
+                    kernel_name_,
                     div_round_up(total_B, kForwardMaxThreads / kWarpSize),
                     dim3(kWarpSize, kForwardMaxThreads / kWarpSize),
                     0,
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 77f807233f..06a27f36f0 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -1246,7 +1246,7 @@ Tensor {{ embedding_cuda_op }}(
                         if (max_D == {{ kDimSize }} && weight_decay_mode == {{ kWeightDecayMode }})
                         {
                             warp_per_row_grid_size = div_round_up(sorted_linear_indices_num_runs[0].item<int32_t>(), segments_per_workgroup);
-                            blockSize = dim3(256);
+                            total_L/total_B == 1? blockSize = dim3(64):dim3(256);
                             warp_per_row_smem_bytes = 0;
 
                             backward_warp_per_row_kernel =
diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index fa6a27ab55..a81aa9e886 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1005,7 +1005,7 @@ static torch::autograd::variable_list backward(
     TORCH_CHECK_EQ(grad_outputs.size(), 1);
 
 #ifdef USE_ROCM
-    constexpr int32_t BT_block_size = 64;
+    int32_t BT_block_size = 64;
     int32_t max_segment_length_per_warp = 64;
     // Workaround. Should not be upstreamed in any way.
     // Redistribute all cta_per_row work to warp_per_row.
@@ -1021,9 +1021,10 @@ static torch::autograd::variable_list backward(
     auto total_B = (offsets.size(0) - 1);
     const auto B = total_B / T;
     {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
-    if(!mixed_D && total_L / total_B > 1 && (max_D == {{ kDimSize }})) 
+    if(!mixed_D  && (max_D == {{ kDimSize }})) 
     {
       max_segment_length_per_warp = 16384;
+      BT_block_size = 16384;
     }
     {%- endfor %}
     {%- endif %}
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
index a1d9819017..707e369b1c 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
@@ -135,16 +135,28 @@ DEVICE_INLINE uint32_t ballot_sync(
 #endif
 }
 
-/// Sums a register value across all warp threads
-template <typename T, int ReduceWidth = kWarpSize>
+// Sums a register value across all warp threads
+template <typename T, int ReduceWidth = kWarpSize, bool embDimMatch = false>
 DEVICE_INLINE T warpReduceAllSum(
     T val,
     unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) {
-      return rocm::wave_reduce<
+
+  if constexpr (embDimMatch) {
+    // Use ROCm wave-level reduction when embeddings dimensions already match warp size.
+    return rocm::wave_reduce<
         rocm::reduce_op::sum,  // Sum reduction
-        T,                                 // Data type
-        ReduceWidth                        // Wave/Warp size
+        T,                     // Data type
+        ReduceWidth            // Wave/Warp size
     >(val);
+  } else {
+    // Generic implementation using shuffle-xor reduction
+    #pragma unroll
+    for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) {
+      T other = shfl_xor(val, mask, ReduceWidth, shfl_sync_mask);
+      val += other;
+    }
+    return val;
+  }
 }
 
 DEVICE_INLINE void syncwarp() {

From 2c44b8daa6439be290d2583a50222e4adb6dd558 Mon Sep 17 00:00:00 2001
From: kudomcho <kudomcho@amd.com>
Date: Tue, 16 Sep 2025 17:53:27 +0000
Subject: [PATCH 24/26] reverted pt2 autograd version back by 1 commit

---
 .../pt2/embedding_split_host_pt2_autograd_template.cpp     | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
index a81aa9e886..cde582ae8d 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp
@@ -1005,7 +1005,7 @@ static torch::autograd::variable_list backward(
     TORCH_CHECK_EQ(grad_outputs.size(), 1);
 
 #ifdef USE_ROCM
-    int32_t BT_block_size = 64;
+    constexpr int32_t BT_block_size = 64;
     int32_t max_segment_length_per_warp = 64;
     // Workaround. Should not be upstreamed in any way.
     // Redistribute all cta_per_row work to warp_per_row.
@@ -1021,10 +1021,9 @@ static torch::autograd::variable_list backward(
     auto total_B = (offsets.size(0) - 1);
     const auto B = total_B / T;
     {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
-    if(!mixed_D  && (max_D == {{ kDimSize }})) 
+    if(!mixed_D && total_L / total_B > 1 && (max_D == {{ kDimSize }})) 
     {
       max_segment_length_per_warp = 16384;
-      BT_block_size = 16384;
     }
     {%- endfor %}
     {%- endif %}
@@ -1313,4 +1312,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
         {{ op_name }} );
 }
 
-{%- endif %} {#-/* if has_gpu_support or has_cpu_support */#}
+{%- endif %} {#-/* if has_gpu_support or has_cpu_support */#}
\ No newline at end of file

From 711e565fc7da9b3b369812f68ad0f54ff56e6347 Mon Sep 17 00:00:00 2001
From: root <root@cv350-zts-gtu-c11-18.zts-gtu.dcgpu>
Date: Tue, 16 Sep 2025 22:47:39 +0000
Subject: [PATCH 25/26] enabled DPP if emb dim in range of backward opt

---
 ..._backward_split_indice_weights_template.cu | 46 ++++++++++++++++---
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
index 95950b8b1e..b6e5681412 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
@@ -13,7 +13,14 @@
 {%- set locs_or_addrs_tensor = "ssd_row_addrs" if ssd else "lxu_cache_locations" %}
 {%- set locs_or_addrs_type = "int64_t" if ssd else "int32_t" %}
 {%- set locs_or_addrs_idx = "row_idx" if ssd else "cache_idx" %}
-
+{%- set is_optimized_hip_kernel_supported_mode = is_rocm and 
+                                                 optimizer == "rowwise_adagrad" and 
+                                                 not dense and
+                                                 not nobag and 
+                                                 not is_index_select and
+                                                 not is_gwd_kernel and 
+                                                 not vbe and 
+                                                 not ssd %}
 ////////////////////////////////////////////////////////////////////////////////
 // Required for op registrations
 ////////////////////////////////////////////////////////////////////////////////
@@ -22,11 +29,9 @@
 #include "fbgemm_gpu/utils/tensor_utils.h"
 #include "fbgemm_gpu/utils/assert_macros.h"
 #include "fbgemm_gpu/utils/kernel_launcher.cuh"
-
 {%- if is_rocm %}
 #include "fbgemm_gpu/rocm/cdna_guard.h"
 {%- endif %}
-
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
 
@@ -71,7 +76,8 @@ template <
   typename grad_t,
   typename cache_t,
   typename index_t,
-  int32_t kFixedMaxVecsPerThread
+  int32_t kFixedMaxVecsPerThread,
+  bool embDimMatch
 >
 __global__ __launch_bounds__(kForwardMaxThreads) void
 {{ mdesc }}_embedding_codegen_grad_indice_weights{{ vdesc }}_{{ vbdesc }}kernel(
@@ -265,7 +271,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
                     {%- endif %}
                 }
                 grad_indice_weight =
-                    warpReduceAllSum<at::acc_type<cache_t, true>>(grad_indice_weight);
+                    warpReduceAllSum<at::acc_type<cache_t, true>, kWarpSize, embDimMatch>(grad_indice_weight);
                 if (threadIdx.x == 0) {
                     {%- if use_vec_blocking %}
                     if (vec_start == 0) {
@@ -425,7 +431,35 @@ Tensor {{ mdesc }}_embedding_codegen_grad_indice_weights{{ vdesc }}_cuda(
                         grad_t,
                         cache_t,
                         index_t,
-                        kFixedMaxVecsPerThread>;
+                        kFixedMaxVecsPerThread,
+                        /*embDimMatch=*/ false>;
+#ifdef USE_ROCM
+                {%- if is_optimized_hip_kernel_supported_mode %}
+                const auto supported_weights_type = dev_weights.scalar_type() == at::ScalarType::Half
+                                                      || dev_weights.scalar_type() == at::ScalarType::Float;
+
+                if (!mixed_D && supported_weights_type && rocm::is_supported_cdna())
+                {
+                    {%- for kDimSize in [64, 128, 160, 192, 256, 320] %}
+                        {%- for kWeightDecayMode in [0, 1, 2] %}
+                        if (max_D == {{ kDimSize }} && weight_decay_mode == {{ kWeightDecayMode }})
+                        {
+                            kernel_name_ =
+                                {{ kernel_name }}
+                                <
+                                    emb_t,
+                                    grad_t,
+                                    cache_t,
+                                    index_t,
+                                    kFixedMaxVecsPerThread,
+                                    /*embDimMatch=*/ true
+                                    >;
+                }
+                {%- endfor %}
+                {%- endfor %}
+            }
+            {%- endif %}
+#endif          
                 FBGEMM_LAUNCH_KERNEL(
                     kernel_name_,
                     div_round_up(total_B, kForwardMaxThreads / kWarpSize),

From af7392a17d513963070d4217af88556411bed8ba Mon Sep 17 00:00:00 2001
From: root <root@cv350-zts-gtu-c11-18.zts-gtu.dcgpu>
Date: Thu, 18 Sep 2025 09:26:57 +0000
Subject: [PATCH 26/26] opt for grad_indice_weights kernel

---
 ..._backward_split_indice_weights_template.cu | 77 ++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
index b6e5681412..e02359f1a2 100755
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
@@ -220,7 +220,82 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
             )
             {%- endif %}
 
-            for (auto j = 0; j < kWarpSize && l_start + j < L; ++j) {
+            int32_t j = 0;
+            {%- if not ssd and not dense and not use_vec_blocking and not vbe %}
+            // Currently for split_embedding_codegen_grad_indice_weights_kernel only
+            for (; j < kWarpSize && l_start + j + 3 < L; j += 4) {
+                const auto offset_idx_j0 = shfl_sync(offset_idx, j);
+                const auto offset_idx_j1 = shfl_sync(offset_idx, j+1);
+                const auto offset_idx_j2 = shfl_sync(offset_idx, j+2);
+                const auto offset_idx_j3 = shfl_sync(offset_idx, j+3);
+
+                const auto cache_idx_j0 = shfl_sync(cache_idx, j);
+                const auto cache_idx_j1 = shfl_sync(cache_idx, j+1);
+                const auto cache_idx_j2 = shfl_sync(cache_idx, j+2);
+                const auto cache_idx_j3 = shfl_sync(cache_idx, j+3);
+
+                at::acc_type<cache_t, true> grad_indice_weight0 = 0.0;
+                at::acc_type<cache_t, true> grad_indice_weight1 = 0.0;
+                at::acc_type<cache_t, true> grad_indice_weight2 = 0.0;
+                at::acc_type<cache_t, true> grad_indice_weight3 = 0.0;
+
+                [[maybe_unused]] const auto weight_row0 = WeightRowAccessor<emb_t, at::acc_type<cache_t, true>>(&weights[offset_idx_j0], D);
+                [[maybe_unused]] const auto weight_row1 = WeightRowAccessor<emb_t, at::acc_type<cache_t, true>>(&weights[offset_idx_j1], D);
+                [[maybe_unused]] const auto weight_row2 = WeightRowAccessor<emb_t, at::acc_type<cache_t, true>>(&weights[offset_idx_j2], D);
+                [[maybe_unused]] const auto weight_row3 = WeightRowAccessor<emb_t, at::acc_type<cache_t, true>>(&weights[offset_idx_j3], D);
+
+                #pragma unroll kFixedMaxVecsPerThread
+                for (int32_t vec = 0; vec < kFixedMaxVecsPerThread && (kWarpSize * vec + threadIdx.x) * kVecWidth < D; ++vec) {
+                    const int32_t d = (kWarpSize * vec + threadIdx.x) * kVecWidth;
+
+                    Vec4T<at::acc_type<cache_t, true>> weight0, weight1, weight2, weight3;
+                    if (placement == PlacementType::MANAGED_CACHING) {
+                        weight0 = (cache_idx_j0 != kCacheLocationMissing) ?
+                        Vec4T<at::acc_type<cache_t, true>>(&lxu_cache_weights[cache_idx_j0][d]) :
+                        weight_row0.load(d);
+
+                        weight1 = (cache_idx_j1 != kCacheLocationMissing) ?
+                        Vec4T<at::acc_type<cache_t, true>>(&lxu_cache_weights[cache_idx_j1][d]) :
+                        weight_row1.load(d);
+
+                        weight2 = (cache_idx_j2 != kCacheLocationMissing) ?
+                        Vec4T<at::acc_type<cache_t, true>>(&lxu_cache_weights[cache_idx_j2][d]) :
+                        weight_row2.load(d);
+
+                        weight3 = (cache_idx_j3 != kCacheLocationMissing) ?
+                        Vec4T<at::acc_type<cache_t, true>>(&lxu_cache_weights[cache_idx_j3][d]) :
+                        weight_row3.load(d);
+                    } else {
+                        weight0 = weight_row0.load(d);
+                        weight1 = weight_row1.load(d);
+                        weight2 = weight_row2.load(d);
+                        weight3 = weight_row3.load(d);
+                    }
+
+                    grad_indice_weight0 += weight0.acc.x * grad_out[vec].acc.x + weight0.acc.y * grad_out[vec].acc.y +
+                            weight0.acc.z * grad_out[vec].acc.z + weight0.acc.w * grad_out[vec].acc.w;
+                    grad_indice_weight1 += weight1.acc.x * grad_out[vec].acc.x + weight1.acc.y * grad_out[vec].acc.y +
+                        weight1.acc.z * grad_out[vec].acc.z + weight1.acc.w * grad_out[vec].acc.w;
+                    grad_indice_weight2 += weight2.acc.x * grad_out[vec].acc.x + weight2.acc.y * grad_out[vec].acc.y +
+                        weight2.acc.z * grad_out[vec].acc.z + weight2.acc.w * grad_out[vec].acc.w;
+                    grad_indice_weight3 += weight3.acc.x * grad_out[vec].acc.x + weight3.acc.y * grad_out[vec].acc.y +
+                        weight3.acc.z * grad_out[vec].acc.z + weight3.acc.w * grad_out[vec].acc.w;
+                }
+                
+                grad_indice_weight0 = warpReduceAllSum<at::acc_type<cache_t, true>>(grad_indice_weight0);
+                grad_indice_weight1 = warpReduceAllSum<at::acc_type<cache_t, true>>(grad_indice_weight1);
+                grad_indice_weight2 = warpReduceAllSum<at::acc_type<cache_t, true>>(grad_indice_weight2);
+                grad_indice_weight3 = warpReduceAllSum<at::acc_type<cache_t, true>>(grad_indice_weight3);
+
+                if (threadIdx.x == 0) {
+                    grad_indice_weights[indices_start + l_start + j] = grad_indice_weight0;
+                    grad_indice_weights[indices_start + l_start + j+1] = grad_indice_weight1;
+                    grad_indice_weights[indices_start + l_start + j+2] = grad_indice_weight2;
+                    grad_indice_weights[indices_start + l_start + j+3] = grad_indice_weight3;
+                }
+            }
+            {%- endif %}
+            for (; j < kWarpSize && l_start + j < L; ++j) {
                 const auto offset_idx_j = shfl_sync(offset_idx, j);
                 {%- if not dense %}
                 const auto {{ locs_or_addrs_idx }}_j = shfl_sync({{ locs_or_addrs_idx }}, j);