NVIDIA · andralex · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
@@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p)
       {
         return data_place::invalid();
       }
-      exec_place_grid* grid_ptr = static_cast<exec_place_grid*>(grid_handle);
+      exec_place* grid_ptr = static_cast<exec_place*>(grid_handle);
       // Layout-compatible: pass C mapper directly so the runtime calls it
       get_executor_func_t cpp_mapper = reinterpret_cast<get_executor_func_t>(mapper);
       return data_place::composite(cpp_mapper, *grid_ptr);
@@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id
   {
     places.push_back(exec_place::device(device_ids[i]));
   }
-  exec_place_grid grid = make_grid(::std::move(places));
-  return new exec_place_grid(::std::move(grid));
+  exec_place grid = make_grid(::std::move(places));
+  return new exec_place(::std::move(grid));
 }
 
 stf_exec_place_grid_handle
@@ -439,18 +439,17 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf
   {
     cpp_places.push_back(to_exec_place(const_cast<stf_exec_place*>(&places[i])));
   }
-  exec_place_grid grid =
-    (grid_dims != nullptr)
-      ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
-      : make_grid(::std::move(cpp_places));
-  return new exec_place_grid(::std::move(grid));
+  exec_place grid = (grid_dims != nullptr)
+                    ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
+                    : make_grid(::std::move(cpp_places));
+  return new exec_place(::std::move(grid));
 }
 
 void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid)
 {
   if (grid != nullptr)
   {
-    delete static_cast<exec_place_grid*>(grid);
+    delete static_cast<exec_place*>(grid);
   }
 }
 

@@ -456,17 +456,17 @@ public:
 
   void set_current_place(pos4 p)
   {
-    get_exec_place().as_grid().set_current_place(p);
+    get_exec_place().set_current_place(p);
   }
 
   void unset_current_place()
   {
-    get_exec_place().as_grid().unset_current_place();
+    get_exec_place().unset_current_place();
   }
 
-  const exec_place& get_current_place() const
+  exec_place get_current_place() const
   {
-    return get_exec_place().as_grid().get_current_place();
+    return get_exec_place().get_current_place();
   }
 
 private:

@@ -87,8 +87,8 @@ public:
       return;
     }
 
-    exec_place_grid grid = memory_node.get_grid();
-    size_t total_size    = this->shape.size();
+    exec_place grid   = memory_node.get_grid();
+    size_t total_size = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
     // * index = x + nx*y + nx*ny*z + nx*ny*nz*t

@@ -1117,9 +1117,6 @@ public:
     }
   }
 
-  template <typename S, typename... Deps>
-  auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete;
-
   template <typename S, typename... Deps>
   auto parallel_for(S shape, Deps... deps)
   {

@@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
 template <typename Fun, typename interpreted_spec, typename Arg>
 void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   p->*[&] {
     auto th = thread_hierarchy(static_cast<int>(rank), interpreted_policy);
@@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 template <typename task_t, typename Fun, typename interpreted_spec, typename Arg>
 void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   auto kernel_args = tuple_prepend(thread_hierarchy(static_cast<int>(rank), interpreted_policy), mv(arg));
   using args_type  = decltype(kernel_args);
@@ -331,11 +331,11 @@ public:
     assert(e_place.affine_data_place() == t.get_affine_data_place());
 
     /*
-     * If we have a grid of places, the implicit affine partitioner is the blocked_partition.
+     * If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition.
      *
      * An explicit composite data place is required per data dependency to customize this behaviour.
      */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Create a composite data place defined by the grid of places + the partitioning function
       t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));

@@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi
   }
   else
   {
-    loop_dispatch<context_t, exec_place_grid, use_threads>(
-      mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
+    loop_dispatch<context_t, exec_place, use_threads>(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
   }
 }
 #endif // _CCCL_DOXYGEN_INVOKED

@@ -550,8 +550,8 @@ public:
     // If there is a partitioner, we ensure there is a proper affine data place for this execution place
     if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
     {
-      // This is only meaningful for grid of places
-      if (e_place.is_grid())
+      // Grids need a composite data place
+      if (e_place.size() > 1)
       {
         // Create a composite data place defined by the grid of places + the partitioning function
         t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
@@ -629,7 +629,7 @@ public:
     if constexpr (need_reduction)
     {
       _CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host.");
-      _CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places.");
+      _CCCL_ASSERT(e_place.size() == 1, "Reduce access mode currently unimplemented on grid of places.");
       do_parallel_for_redux(f, e_place, shape, t);
       return;
     }
@@ -659,17 +659,18 @@ public:
     if constexpr (!::std::is_same_v<exec_place_t, exec_place_host> && is_extended_host_device_lambda_closure_type
                   || is_extended_device_lambda_closure_type)
     {
-      if (!e_place.is_grid())
+      if (e_place.size() == 1)
       {
         // Apply the parallel_for construct over the entire shape on the
-        // execution place of the task
+        // execution place of the task.
+        const exec_place& scalar_place = e_place;
         if constexpr (need_reduction)
         {
-          do_parallel_for_redux(f, e_place, shape, t);
+          do_parallel_for_redux(f, scalar_place, shape, t);
         }
         else
         {
-          do_parallel_for(f, e_place, shape, t);
+          do_parallel_for(f, scalar_place, shape, t);
         }
       }
       else
@@ -681,11 +682,12 @@ public:
         }
         else
         {
-          size_t grid_size = t.grid_dims().size();
+          const auto& t_place = t.get_exec_place();
+          size_t grid_size    = t_place.size();
           for (size_t i = 0; i < grid_size; i++)
           {
             t.set_current_place(pos4(i));
-            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims());
+            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims());
             do_parallel_for(f, t.get_current_place(), sub_shape, t);
             t.unset_current_place();
           }

@@ -75,12 +75,8 @@ public:
   // ::std::function<pos4(size_t)> delinearize : translate the index in a buffer into a position in the data
   // TODO pass mv(place)
   template <typename F>
-  localized_array(exec_place_grid grid,
-                  get_executor_func_t mapper,
-                  F&& delinearize,
-                  size_t total_size,
-                  size_t elemsize,
-                  dim4 data_dims)
+  localized_array(
+    exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 data_dims)
       : grid(mv(grid))
       , mapper(mv(mapper))
       , total_size_bytes(total_size * elemsize)
@@ -422,7 +418,7 @@ private:
   }
 
   event_list prereqs; // To allow reuse in a cache
-  exec_place_grid grid;
+  exec_place grid;
   get_executor_func_t mapper = nullptr;
   ::std::vector<metadata> meta;
 

@@ -46,7 +46,6 @@ namespace cuda::experimental::stf
 {
 // Forward declarations
 class exec_place;
-class exec_place_grid;
 class pos4;
 class dim4;
 
@@ -184,7 +183,7 @@ public:
    * @brief Get the grid for composite places
    * @throws std::logic_error if not a composite place
    */
-  virtual const exec_place_grid& get_grid() const
+  virtual const exec_place& get_grid() const
   {
     throw ::std::logic_error("get_grid() called on non-composite data_place");
   }

@@ -30,92 +30,82 @@
 namespace cuda::experimental::stf
 {
 /**
- * @brief Designates execution that is to run on a specific CUDA stream
- *
+ * @brief Implementation for CUDA stream execution places
  */
-class exec_place_cuda_stream : public exec_place
+class exec_place_cuda_stream_impl : public exec_place::impl
 {
 public:
-  class impl : public exec_place::impl
+  exec_place_cuda_stream_impl(const decorated_stream& dstream)
+      : exec_place::impl(data_place::device(dstream.dev_id))
+      , dstream_(dstream)
+      , dummy_pool_(dstream)
+  {}
+
+  exec_place get_place(size_t idx) const override
   {
-  public:
-    impl(const decorated_stream& _dstream)
-        : exec_place::impl(data_place::device(_dstream.dev_id))
-        , dstream(_dstream)
-        , dummy_pool(_dstream)
-    {}
-
-    /* We set the current device to be the device on which the CUDA stream was created */
-    exec_place activate() const override
-    {
-      return exec_place::device(dstream.dev_id).activate();
-    }
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::cuda_stream(dstream_);
+  }
 
-    void deactivate(const exec_place& prev) const override
-    {
-      return exec_place::device(dstream.dev_id).deactivate(prev);
-    }
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::device(dstream_.dev_id).activate();
+  }
 
-    stream_pool& get_stream_pool(bool) const override
-    {
-      return dummy_pool;
-    }
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    exec_place::device(dstream_.dev_id).deactivate(prev);
+  }
 
-    ::std::string to_string() const override
-    {
-      return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")";
-    }
+  stream_pool& get_stream_pool(bool) const override
+  {
+    return dummy_pool_;
+  }
 
-    bool operator==(const exec_place::impl& rhs) const override
+  ::std::string to_string() const override
+  {
+    return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
+  }
+
+  int cmp(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return false;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      // Compare by stream handle
-      return dstream.stream == other.dstream.stream;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
-
-    size_t hash() const override
+    const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
+    if (dstream_.stream < other.dstream_.stream)
     {
-      // Hash the stream handle, not the affine data place
-      return ::std::hash<cudaStream_t>()(dstream.stream);
+      return -1;
     }
-
-    bool operator<(const exec_place::impl& rhs) const override
+    if (other.dstream_.stream < dstream_.stream)
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return dstream.stream < other.dstream.stream;
+      return 1;
     }
+    return 0;
+  }
 
-  private:
-    decorated_stream dstream;
-    // We create a dummy pool of streams which only consists in a single stream in practice.
-    mutable stream_pool dummy_pool;
-  };
-
-public:
-  exec_place_cuda_stream(const decorated_stream& dstream)
-      : exec_place(::std::make_shared<impl>(dstream))
+  size_t hash() const override
   {
-    static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place),
-                  "exec_place_cuda_stream cannot add state; it would be sliced away.");
+    return ::std::hash<cudaStream_t>()(dstream_.stream);
   }
+
+private:
+  decorated_stream dstream_;
+  mutable stream_pool dummy_pool_;
 };
 
-inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)
+inline exec_place exec_place::cuda_stream(cudaStream_t stream)
 {
   int devid = get_device_from_stream(stream);
-  return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));
+  return exec_place(
+    ::std::make_shared<exec_place_cuda_stream_impl>(decorated_stream(stream, get_stream_id(stream), devid)));
 }
 
-inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)
+inline exec_place exec_place::cuda_stream(const decorated_stream& dstream)
 {
-  return exec_place_cuda_stream(dstream);
+  return exec_place(::std::make_shared<exec_place_cuda_stream_impl>(dstream));
 }
 } // end namespace cuda::experimental::stf