From dacd49e814a17e87c34d6e8d840e7c3224da3e38 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 22:02:59 -0400
Subject: [PATCH 01/12] [WIP] Refactor exec_place to unified grid model

All execution places are now modeled as grids:
- Scalar places (host, device) are 1-element grids
- Multi-device grids remain as before

Key changes:
- Added get_dims(), get_place(idx) to exec_place::impl
- Changed activate/deactivate to take index parameter
- Moved set_current_place/unset_current_place to exec_place base
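- Deprecated is_grid() in favor of size() > 1; grid_dim(), grid_dims() and
  as_grid() are kept but marked deprecated
- Updated client code to use the new interface (as_grid() is still called
  when building composite data places)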

This eliminates special-casing for grids vs non-grids and allows
uniform iteration over any exec_place.

Made-with: Cursor
---
 .../experimental/__stf/graph/graph_task.cuh   |   8 +-
 .../experimental/__stf/internal/launch.cuh    |   8 +-
 .../__stf/internal/parallel_for_scope.cuh     |  13 +-
 .../__stf/places/exec/cuda_stream.cuh         |  39 +-
 .../__stf/places/exec/green_context.cuh       |  70 +-
 .../__stf/places/place_partition.cuh          |  14 +-
 .../cuda/experimental/__stf/places/places.cuh | 666 +++++++++---------
 .../experimental/__stf/stream/stream_task.cuh |  38 +-
 8 files changed, 428 insertions(+), 428 deletions(-)
diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
index 39375f6db2d..3251b32eded 100644
--- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
@@ -456,17 +456,17 @@ public:
 
   void set_current_place(pos4 p)
   {
-    get_exec_place().as_grid().set_current_place(p);
+    get_exec_place().set_current_place(p);
   }
 
   void unset_current_place()
   {
-    get_exec_place().as_grid().unset_current_place();
+    get_exec_place().unset_current_place();
   }
 
-  const exec_place& get_current_place() const
+  exec_place get_current_place() const
   {
-    return get_exec_place().as_grid().get_current_place();
+    return get_exec_place().get_current_place();
   }
 
 private:
diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index b33b14929c7..f20205acd59 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
 template <typename Fun, typename interpreted_spec, typename Arg>
 void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   p->*[&] {
     auto th = thread_hierarchy(static_cast<int>(rank), interpreted_policy);
@@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 template <typename task_t, typename Fun, typename interpreted_spec, typename Arg>
 void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   auto kernel_args = tuple_prepend(thread_hierarchy(static_cast<int>(rank), interpreted_policy), mv(arg));
   using args_type  = decltype(kernel_args);
@@ -331,11 +331,11 @@ public:
     assert(e_place.affine_data_place() == t.get_affine_data_place());
 
     /*
-     * If we have a grid of places, the implicit affine partitioner is the blocked_partition.
+     * If we have a multi-place grid, the implicit affine partitioner is the blocked_partition.
      *
      * An explicit composite data place is required per data dependency to customize this behaviour.
      */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Create a composite data place defined by the grid of places + the partitioning function
       t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index ccc420f6609..90acd49ad22 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -550,8 +550,8 @@ public:
     // If there is a partitioner, we ensure there is a proper affine data place for this execution place
     if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
     {
-      // This is only meaningful for grid of places
-      if (e_place.is_grid())
+      // This is only meaningful for multi-place grids
+      if (e_place.size() > 1)
       {
         // Create a composite data place defined by the grid of places + the partitioning function
         t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
@@ -629,7 +629,7 @@ public:
     if constexpr (need_reduction)
     {
       _CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host.");
-      _CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places.");
+      _CCCL_ASSERT(e_place.size() == 1, "Reduce access mode currently unimplemented on grid of places.");
       do_parallel_for_redux(f, e_place, shape, t);
       return;
     }
@@ -659,7 +659,7 @@ public:
     if constexpr (!::std::is_same_v<exec_place_t, exec_place_host> && is_extended_host_device_lambda_closure_type
                   || is_extended_device_lambda_closure_type)
     {
-      if (!e_place.is_grid())
+      if (e_place.size() == 1)
       {
         // Apply the parallel_for construct over the entire shape on the
         // execution place of the task
@@ -681,11 +681,12 @@ public:
         }
         else
         {
-          size_t grid_size = t.grid_dims().size();
+          const auto& t_place = t.get_exec_place();
+          size_t grid_size    = t_place.size();
           for (size_t i = 0; i < grid_size; i++)
           {
             t.set_current_place(pos4(i));
-            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims());
+            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims());
             do_parallel_for(f, t.get_current_place(), sub_shape, t);
             t.unset_current_place();
           }
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
index a18545e4014..5cf256cb9ea 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
@@ -41,29 +41,37 @@ public:
   public:
     impl(const decorated_stream& _dstream)
         : exec_place::impl(data_place::device(_dstream.dev_id))
-        , dstream(_dstream)
-        , dummy_pool(_dstream)
+        , dstream_(_dstream)
+        , dummy_pool_(_dstream)
     {}
 
-    /* We set the current device to be the device on which the CUDA stream was created */
-    exec_place activate() const override
+    // Grid interface - cuda_stream is a 1-element grid
+    exec_place get_place(size_t idx) const override
     {
-      return exec_place::device(dstream.dev_id).activate();
+      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+      return exec_place::cuda_stream(dstream_);
     }
 
-    void deactivate(const exec_place& prev) const override
+    exec_place activate(size_t idx) const override
     {
-      return exec_place::device(dstream.dev_id).deactivate(prev);
+      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+      return exec_place::device(dstream_.dev_id).activate();
+    }
+
+    void deactivate(size_t idx, const exec_place& prev) const override
+    {
+      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+      exec_place::device(dstream_.dev_id).deactivate(prev);
     }
 
     stream_pool& get_stream_pool(bool) const override
     {
-      return dummy_pool;
+      return dummy_pool_;
     }
 
     ::std::string to_string() const override
     {
-      return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")";
+      return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
     }
 
     bool operator==(const exec_place::impl& rhs) const override
@@ -73,14 +81,12 @@ public:
         return false;
       }
       const auto& other = static_cast<const impl&>(rhs);
-      // Compare by stream handle
-      return dstream.stream == other.dstream.stream;
+      return dstream_.stream == other.dstream_.stream;
     }
 
     size_t hash() const override
     {
-      // Hash the stream handle, not the affine data place
-      return ::std::hash<cudaStream_t>()(dstream.stream);
+      return ::std::hash<cudaStream_t>()(dstream_.stream);
     }
 
     bool operator<(const exec_place::impl& rhs) const override
@@ -90,13 +96,12 @@ public:
         return typeid(*this).before(typeid(rhs));
       }
       const auto& other = static_cast<const impl&>(rhs);
-      return dstream.stream < other.dstream.stream;
+      return dstream_.stream < other.dstream_.stream;
     }
 
   private:
-    decorated_stream dstream;
-    // We create a dummy pool of streams which only consists in a single stream in practice.
-    mutable stream_pool dummy_pool;
+    decorated_stream dstream_;
+    mutable stream_pool dummy_pool_;
   };
 
 public:
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
index f5e5531eb52..7a7c6b62c8d 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
@@ -270,53 +270,50 @@ public:
     impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
         : exec_place::impl(
             use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid))
-        , devid(gc_view.devid)
-        , g_ctx(gc_view.g_ctx)
-        , pool(mv(gc_view.pool))
+        , devid_(gc_view.devid)
+        , g_ctx_(gc_view.g_ctx)
+        , pool_(mv(gc_view.pool))
     {}
 
     // This is used to implement deactivate and wrap an existing context
     impl(CUcontext saved_context)
-        : driver_context(saved_context)
+        : driver_context_(saved_context)
     {}
 
-    exec_place activate() const override
+    // Grid interface - green_ctx is a 1-element grid
+    exec_place get_place(size_t idx) const override
     {
+      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+      return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_));
+    }
+
+    exec_place activate(size_t idx) const override
+    {
+      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+
       // Save the current context and transform it into a fake green context place
       CUcontext current_ctx;
       cuda_safe_call(cuCtxGetCurrent(&current_ctx));
       exec_place result = exec_place(::std::make_shared<impl>(current_ctx));
 
-      // Convert the green context to a primary context (TODO cache this ?)
-      cuda_safe_call(cuCtxFromGreenCtx(&driver_context, g_ctx));
-
-#  if 0
-            // for debug purposes, display the affinity
-            {
-                CUdevResource check_resource;
-                cuda_safe_call(cuGreenCtxGetDevResource(g_ctx, &check_resource, CU_DEV_RESOURCE_TYPE_SM));
-                unsigned long long check_ctxId;
-                cuda_safe_call(cuCtxGetId(driver_context, &check_ctxId));
-                fprintf(stderr, "ACTIVATE : set affinity with %d SMs (ctx ID = %llu)\n", check_resource.sm.smCount,
-                        check_ctxId);
-            }
-#  endif
-
-      cuda_safe_call(cuCtxSetCurrent(driver_context));
+      // Convert the green context to a primary context
+      cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_));
+      cuda_safe_call(cuCtxSetCurrent(driver_context_));
 
       return result;
     }
 
-    void deactivate(const exec_place& prev) const override
+    void deactivate(size_t idx, const exec_place& prev) const override
     {
+      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+
       auto prev_impl      = ::std::static_pointer_cast<impl>(prev.get_impl());
-      CUcontext saved_ctx = prev_impl->driver_context;
+      CUcontext saved_ctx = prev_impl->driver_context_;
 
 #  ifdef DEBUG
-      // Ensure that the current context is the green context that we have activated before
       CUcontext current_ctx;
       cuda_safe_call(cuCtxGetCurrent(&current_ctx));
-      assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context));
+      assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_));
 #  endif
 
       cuda_safe_call(cuCtxSetCurrent(saved_ctx));
@@ -324,13 +321,12 @@ public:
 
     ::std::string to_string() const override
     {
-      return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid)
-           + ")";
+      return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")";
     }
 
     stream_pool& get_stream_pool(bool) const override
     {
-      return pool;
+      return pool_;
     }
 
     bool operator==(const exec_place::impl& rhs) const override
@@ -340,14 +336,12 @@ public:
         return false;
       }
       const auto& other = static_cast<const impl&>(rhs);
-      // Compare green context handles
-      return g_ctx == other.g_ctx;
+      return g_ctx_ == other.g_ctx_;
     }
 
     size_t hash() const override
     {
-      // Hash the green context handle, not the affine data place
-      return ::std::hash<CUgreenCtx>()(g_ctx);
+      return ::std::hash<CUgreenCtx>()(g_ctx_);
     }
 
     bool operator<(const exec_place::impl& rhs) const override
@@ -357,16 +351,14 @@ public:
         return typeid(*this).before(typeid(rhs));
       }
       const auto& other = static_cast<const impl&>(rhs);
-      return g_ctx < other.g_ctx;
+      return g_ctx_ < other.g_ctx_;
     }
 
   private:
-    int devid        = -1;
-    CUgreenCtx g_ctx = {};
-    // a context created from the green context (or used to store an existing context to implement
-    // activate/deactivate)
-    mutable CUcontext driver_context = {};
-    mutable stream_pool pool;
+    int devid_                        = -1;
+    CUgreenCtx g_ctx_                 = {};
+    mutable CUcontext driver_context_ = {};
+    mutable stream_pool pool_;
   };
 
 public:
diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
index d7cd70dca54..e0d2afed705 100644
--- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
@@ -222,7 +222,7 @@ private:
   /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */
   void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope)
   {
-    if (place.is_grid() && scope == place_partition_scope::cuda_stream)
+    if (place.size() > 1 && scope == place_partition_scope::cuda_stream)
     {
       // Recursively partition grid into devices, then into streams
       for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device))
@@ -247,7 +247,7 @@ private:
 
 // Green contexts are only supported since CUDA 12.4
 #if _CCCL_CTK_AT_LEAST(12, 4)
-    if (place.is_grid() && scope == place_partition_scope::green_context)
+    if (place.size() > 1 && scope == place_partition_scope::green_context)
     {
       // Recursively partition grid into devices, then into green contexts
       for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device))
@@ -291,11 +291,13 @@ private:
 #endif // _CCCL_CTK_BELOW(12, 4)
     _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle.");
 
-    if (place.is_grid() && scope == place_partition_scope::cuda_device)
+    if (place.size() > 1 && scope == place_partition_scope::cuda_device)
     {
-      exec_place_grid g = place.as_grid();
-      // Copy the vector of places
-      sub_places = g.get_places();
+      // Get places from the grid
+      for (size_t i = 0; i < place.size(); ++i)
+      {
+        sub_places.push_back(place.get_place(i));
+      }
       return;
     }
 
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 8b8a75c23cc..f1708188986 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -377,18 +377,19 @@ inline data_place from_index(size_t n);
 /**
  * @brief Indicates where a computation takes place (CPU, dev0, dev1, ...)
  *
- * Currently data and computation are together `(devid == int(data_place))`.
+ * All execution places are modeled as grids. Scalar places (host, single device)
+ * are simply 1-element grids. This unified model eliminates special-casing and
+ * allows uniform iteration over any exec_place.
  */
 class exec_place
 {
 public:
   /*
-   * @brief Using the pimpl idiom. Public because a number of classes inehrit from this.
+   * @brief Using the pimpl idiom. Public because a number of classes inherit from this.
    */
   class impl
   {
   public:
-    // Note that the default ctor assumes an invalid affine data place
     impl()                       = default;
     impl(const impl&)            = delete;
     impl& operator=(const impl&) = delete;
@@ -398,8 +399,44 @@ public:
         : affine(mv(place))
     {}
 
-    virtual exec_place activate() const
+    // ===== Grid interface (all places are grids) =====
+
+    /**
+     * @brief Get the dimensions of this grid
+     *
+     * For scalar places, returns dim4(1, 1, 1, 1).
+     */
+    virtual dim4 get_dims() const
+    {
+      return dim4(1, 1, 1, 1);
+    }
+
+    /**
+     * @brief Get the total number of places in this grid
+     */
+    virtual size_t size() const
+    {
+      return 1;
+    }
+
+    /**
+     * @brief Get the sub-place at the given linear index
+     *
+     * For scalar places, idx must be 0.
+     */
+    virtual exec_place get_place(size_t idx) const;
+
+    // ===== Activation/deactivation (indexed) =====
+
+    /**
+     * @brief Activate the sub-place at the given index
+     *
+     * For scalar places, idx must be 0.
+     * Returns the previous execution state needed for deactivate().
+     */
+    virtual exec_place activate(size_t idx) const
     {
+      EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
       if (!affine.is_device())
       {
         return exec_place();
@@ -414,8 +451,12 @@ public:
       return exec_place(mv(old_dev));
     }
 
-    virtual void deactivate(const exec_place& prev) const
+    /**
+     * @brief Deactivate the sub-place at the given index, restoring previous state
+     */
+    virtual void deactivate(size_t idx, const exec_place& prev) const
     {
+      EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
       if (affine.is_device())
       {
         auto current_dev_id  = cuda_try<cudaGetDevice>();
@@ -427,6 +468,8 @@ public:
       }
     }
 
+    // ===== Properties =====
+
     virtual const data_place affine_data_place() const
     {
       return affine;
@@ -447,14 +490,13 @@ public:
       return affine.is_device();
     }
 
+    /**
+     * @brief Check if this is a multi-element grid (size > 1)
+     * @deprecated Use size() > 1 instead
+     */
     virtual bool is_grid() const
     {
-      return false;
-    }
-
-    virtual size_t size() const
-    {
-      return 1;
+      return size() > 1;
     }
 
     virtual void set_affine_data_place(data_place place)
@@ -462,6 +504,8 @@ public:
       affine = mv(place);
     }
 
+    // ===== Comparison =====
+
     virtual bool operator==(const impl& rhs) const
     {
       return affine == rhs.affine;
@@ -474,22 +518,15 @@ public:
 
     virtual bool operator<(const impl& rhs) const
     {
-      // Different types: order by typeid
       if (typeid(*this) != typeid(rhs))
       {
         return typeid(*this).before(typeid(rhs));
       }
-      // Same type (both base impl): compare by device ID
-      // (base impl stores devid in affine, so we extract it via device_ordinal)
       return device_ordinal(affine) < device_ordinal(rhs.affine);
     }
 
-    /**
-     * @brief Get the stream pool for this execution place.
-     *
-     * The base implementation returns pool_compute or pool_data stored
-     * directly on the impl.
-     */
+    // ===== Stream management =====
+
     virtual stream_pool& get_stream_pool(bool for_computation) const
     {
       return for_computation ? pool_compute : pool_data;
@@ -503,6 +540,10 @@ public:
     data_place affine = data_place::invalid();
     mutable stream_pool pool_compute;
     mutable stream_pool pool_data;
+
+    // Current place state for grid iteration
+    mutable ::std::ptrdiff_t current_idx = -1;
+    mutable ::std::shared_ptr<impl> saved_prev_impl;
   };
 
   exec_place() = default;
@@ -522,7 +563,6 @@ public:
     return !(*this == rhs);
   }
 
-  // To use in a ::std::map indexed by exec_place
   bool operator<(const exec_place& rhs) const
   {
     return *pimpl < *rhs.pimpl;
@@ -543,20 +583,51 @@ public:
     return !(*this < rhs);
   }
 
+  size_t hash() const
+  {
+    return pimpl->hash();
+  }
+
+  // ===== Grid interface (all places are grids) =====
+
   /**
-   * @brief Compute a hash value for this execution place
+   * @brief Get the dimensions of this grid
    *
-   * Used by std::hash specialization for unordered containers.
+   * For scalar places (host, single device), returns dim4(1, 1, 1, 1).
    */
-  size_t hash() const
+  dim4 get_dims() const
   {
-    return pimpl->hash();
+    return pimpl->get_dims();
   }
 
   /**
-   * @brief an iterator class which goes over all subplaces in an exec place.
+   * @brief Get the total number of places in this grid
+   */
+  size_t size() const
+  {
+    return pimpl->size();
+  }
+
+  /**
+   * @brief Get the sub-place at the given linear index
    *
-   * This is a trivial singleton unless we have a grid of places.
+   * For scalar places, idx must be 0 and the place itself is returned.
+   */
+  exec_place get_place(size_t idx) const
+  {
+    return pimpl->get_place(idx);
+  }
+
+  /**
+   * @brief Get the sub-place at the given multi-dimensional position
+   */
+  exec_place get_place(pos4 p) const
+  {
+    return get_place(get_dims().get_index(p));
+  }
+
+  /**
+   * @brief an iterator class which goes over all subplaces in an exec place.
    */
   class iterator
   {
@@ -566,7 +637,10 @@ public:
         , index(index)
     {}
 
-    exec_place operator*();
+    exec_place operator*()
+    {
+      return it_impl->get_place(index);
+    }
 
     iterator& operator++()
     {
@@ -598,68 +672,123 @@ public:
     return iterator(pimpl, pimpl->size());
   }
 
+  // ===== Activation/deactivation =====
+
   /**
-   * @brief Returns a string representation of the execution place object.
+   * @brief Activate the sub-place at the given index
    *
-   * @return std::string
+   * @param idx The index of the sub-place to activate (default 0 for scalar places)
+   * @return The previous execution state needed for deactivate()
    */
-  ::std::string to_string() const
+  exec_place activate(size_t idx = 0) const
   {
-    return pimpl->to_string();
+    return pimpl->activate(idx);
   }
 
   /**
-   * @brief Returns the `data_place` naturally associated with this execution place.
+   * @brief Deactivate the sub-place at the given index, restoring previous state
+   *
+   * @param idx The index of the sub-place to deactivate (must be 0 for scalar places)
+   * @param prev The previous state returned by activate()
    */
-  const data_place affine_data_place() const
+  void deactivate(size_t idx, const exec_place& prev) const
   {
-    return pimpl->affine_data_place();
+    pimpl->deactivate(idx, prev);
   }
 
-  void set_affine_data_place(data_place place)
+  /**
+   * @brief Convenience overload for scalar places (idx=0)
+   */
+  void deactivate(const exec_place& prev) const
   {
-    pimpl->set_affine_data_place(mv(place));
+    deactivate(0, prev);
   }
 
-  stream_pool& get_stream_pool(bool for_computation) const
+  /**
+   * @brief Set the current place for grid iteration
+   *
+   * Deactivates any previously set current place, then activates the place at idx and saves state for restoration.
+   */
+  void set_current_place(size_t idx)
   {
-    return pimpl->get_stream_pool(for_computation);
+    if (pimpl->current_idx >= 0)
+    {
+      exec_place saved_prev(pimpl->saved_prev_impl);
+      pimpl->deactivate(pimpl->current_idx, saved_prev);
+    }
+    pimpl->current_idx     = static_cast<::std::ptrdiff_t>(idx);
+    exec_place prev        = pimpl->activate(idx);
+    pimpl->saved_prev_impl = prev.pimpl;
   }
 
   /**
-   * @brief Get a decorated stream from the stream pool associated to this execution place.
+   * @brief Set the current place using multi-dimensional position
    */
-  decorated_stream getStream(bool for_computation) const;
-
-  cudaStream_t pick_stream(bool for_computation = true) const
+  void set_current_place(pos4 p)
   {
-    return getStream(for_computation).stream;
+    set_current_place(get_dims().get_index(p));
   }
 
-  // TODO make protected !
-  const ::std::shared_ptr<impl>& get_impl() const
+  /**
+   * @brief Unset the current place, restoring previous execution context
+   */
+  void unset_current_place()
   {
-    return pimpl;
+    EXPECT(pimpl->current_idx >= 0, "unset_current_place() called without corresponding set_current_place()");
+    exec_place saved_prev(pimpl->saved_prev_impl);
+    pimpl->deactivate(pimpl->current_idx, saved_prev);
+    pimpl->current_idx = -1;
   }
 
   /**
-   * @brief Set computation to run on this place.
-   *
-   * @return `exec_place` The previous execution place. See `deactivate` below.
+   * @brief Get the currently active sub-place
    */
-  exec_place activate() const
+  exec_place get_current_place() const
   {
-    return pimpl->activate();
+    EXPECT(pimpl->current_idx >= 0, "No current place set");
+    return get_place(pimpl->current_idx);
   }
 
   /**
-   * @brief Undoes the effect of `activate`. Call with the previous `exec_place` object returned by `activate`.
-   *
-   * @warning Undefined behavior if you don't pass the result of `activate`.
+   * @brief Get the index of the currently active sub-place, or -1 if none
    */
-  void deactivate(const exec_place& p) const
+  ::std::ptrdiff_t current_place_id() const
   {
-    pimpl->deactivate(p);
+    return pimpl->current_idx;
+  }
+
+  // ===== Properties =====
+
+  ::std::string to_string() const
+  {
+    return pimpl->to_string();
+  }
+
+  const data_place affine_data_place() const
+  {
+    return pimpl->affine_data_place();
+  }
+
+  void set_affine_data_place(data_place place)
+  {
+    pimpl->set_affine_data_place(mv(place));
+  }
+
+  stream_pool& get_stream_pool(bool for_computation) const
+  {
+    return pimpl->get_stream_pool(for_computation);
+  }
+
+  decorated_stream getStream(bool for_computation) const;
+
+  cudaStream_t pick_stream(bool for_computation = true) const
+  {
+    return getStream(for_computation).stream;
+  }
+
+  const ::std::shared_ptr<impl>& get_impl() const
+  {
+    return pimpl;
   }
 
   bool is_host() const
@@ -672,24 +801,38 @@ public:
     return pimpl->is_device();
   }
 
+  /**
+   * @brief Check if this is a multi-element grid (size > 1)
+   * @deprecated Use size() > 1 instead. All places are now grids.
+   */
   bool is_grid() const
   {
     return pimpl->is_grid();
   }
 
-  size_t size() const
+  /**
+   * @brief Get the dimension along a specific axis
+   * @deprecated Use get_dims().get(axis_id) instead
+   */
+  size_t grid_dim(int axis_id) const
   {
-    return pimpl->size();
+    return get_dims().get(axis_id);
   }
 
-  // Get the implementation assuming this is a grid
-  // We need to defer the implementation after exec_place_grid has been
-  // defined because this requires a ::std::static_pointer_cast from the base
-  // class to exec_place_grid
-  exec_place_grid as_grid() const;
+  /**
+   * @brief Get all dimensions
+   * @deprecated Use get_dims() instead
+   */
+  dim4 grid_dims() const
+  {
+    return get_dims();
+  }
 
-  size_t grid_dim(int axid_is) const;
-  dim4 grid_dims() const;
+  /**
+   * @brief Convert to exec_place_grid type
+   * @deprecated All places are grids now; use exec_place methods directly
+   */
+  exec_place_grid as_grid() const;
 
   /* These helper methods provide convenient way to express execution places,
    * for example exec_place::host or exec_place::device(4).
@@ -870,11 +1013,11 @@ inline decorated_stream exec_place::getStream(bool for_computation) const
 /**
  * @brief Designates execution that is to run on the host.
  *
+ * Host is modeled as a 1-element grid containing the host execution context.
  */
 class exec_place_host : public exec_place
 {
 public:
-  // Implementation of the exec_place_host class
   class impl : public exec_place::impl
   {
   public:
@@ -882,21 +1025,27 @@ public:
         : exec_place::impl(data_place::host())
     {}
 
-    // operator<: base class implementation is correct (compares typeid, then device_ordinal).
-    // Since host is a singleton, all instances compare equal.
+    // Grid interface - host is a 1-element grid
+    exec_place get_place(size_t idx) const override;
 
-    exec_place activate() const override
+    // Activation - no-op for host
+    exec_place activate(size_t idx) const override
     {
+      EXPECT(idx == 0, "Index out of bounds for host exec_place");
       return exec_place();
-    } // no-op
-    void deactivate(const exec_place& p) const override
+    }
+
+    void deactivate(size_t idx, const exec_place& prev) const override
     {
-      _CCCL_ASSERT(!p.get_impl(), "");
-    } // no-op
-    virtual const data_place affine_data_place() const override
+      EXPECT(idx == 0, "Index out of bounds for host exec_place");
+      _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev");
+    }
+
+    const data_place affine_data_place() const override
     {
       return data_place::host();
     }
+
     stream_pool& get_stream_pool(bool for_computation) const override
     {
       return exec_place::current_device().get_stream_pool(for_computation);
@@ -943,6 +1092,8 @@ UNITTEST("exec_place_host::operator->*")
 
 /**
  * @brief Designates execution that is to run on a specific CUDA device.
+ *
+ * Device is modeled as a 1-element grid containing that device.
  */
 class exec_place_device : public exec_place
 {
@@ -952,10 +1103,22 @@ public:
   public:
     explicit impl(int devid)
         : exec_place::impl(data_place::device(devid))
+        , devid_(devid)
     {
       pool_compute = stream_pool(pool_size);
       pool_data    = stream_pool(data_pool_size);
     }
+
+    // Grid interface - device is a 1-element grid
+    exec_place get_place(size_t idx) const override;
+
+    int get_devid() const
+    {
+      return devid_;
+    }
+
+  private:
+    int devid_;
   };
 };
 
@@ -1032,83 +1195,87 @@ public:
   class impl : public exec_place::impl
   {
   public:
-    // Define a grid directly from a vector of places
-    // This creates an execution grid automatically
     impl(::std::vector<exec_place> _places)
-        : dims(_places.size(), 1, 1, 1)
-        , places(mv(_places))
+        : dims_(_places.size(), 1, 1, 1)
+        , places_(mv(_places))
     {
-      _CCCL_ASSERT(!places.empty(), "");
-      _CCCL_ASSERT(dims.x > 0, "");
-      _CCCL_ASSERT(affine.is_invalid(), "");
+      _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+      _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
     }
 
-    // With a "dim4 shape"
     impl(::std::vector<exec_place> _places, const dim4& _dims)
-        : dims(_dims)
-        , places(mv(_places))
+        : dims_(_dims)
+        , places_(mv(_places))
+    {
+      _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
+    }
+
+    // ===== Grid interface =====
+
+    dim4 get_dims() const override
     {
-      _CCCL_ASSERT(dims.x > 0, "");
-      _CCCL_ASSERT(affine.is_invalid(), "");
+      return dims_;
     }
 
-    // TODO improve with a better description
-    ::std::string to_string() const final
+    size_t size() const override
     {
-      return ::std::string("GRID place");
+      return dims_.size();
     }
 
-    exec_place activate() const override
+    exec_place get_place(size_t idx) const override
     {
-      // No-op
-      return exec_place();
+      EXPECT(idx < places_.size(), "Index out of bounds");
+      return places_[idx];
+    }
+
+    // ===== Activation (delegates to sub-places) =====
+
+    exec_place activate(size_t idx) const override
+    {
+      EXPECT(idx < places_.size(), "Index out of bounds");
+      return places_[idx].activate(0);
     }
 
-    // TODO : shall we deactivate the current place, if any ?
-    void deactivate(const exec_place& _prev) const override
+    void deactivate(size_t idx, const exec_place& prev) const override
     {
-      // No-op
-      EXPECT(!_prev.get_impl(), "Invalid execution place.");
+      EXPECT(idx < places_.size(), "Index out of bounds");
+      places_[idx].deactivate(0, prev);
+    }
+
+    // ===== Properties =====
+
+    ::std::string to_string() const override
+    {
+      return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z)
+           + "x" + ::std::to_string(dims_.t) + ")";
     }
 
-    /* Dynamically checks whether an execution place is a device */
     bool is_device() const override
     {
       return false;
     }
 
-    /* Dynamically checks whether an execution place is a grid */
-    bool is_grid() const override
+    bool is_host() const override
     {
-      return true;
+      return false;
     }
 
+    // ===== Comparison =====
+
     bool operator==(const exec_place::impl& rhs) const override
     {
-      // First, check if rhs is of type exec_place_grid::impl
       auto other = dynamic_cast<const impl*>(&rhs);
       if (!other)
       {
-        return false; // rhs is not a grid, so they are not equal
+        return false;
       }
-
-      // Compare two grids
-      return *this == *other;
-    }
-
-    // Compare two grids
-    bool operator==(const impl& rhs) const
-    {
-      // Compare grid-specific properties
-      // Note: for grids, equality is determined by dims and places, not the affine data place
-      return dims == rhs.dims && places == rhs.places;
+      return dims_ == other->dims_ && places_ == other->places_;
     }
 
     size_t hash() const override
     {
-      // Hash based on dims and places, consistent with operator==
-      size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims);
-      for (const auto& p : places)
+      size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims_);
+      for (const auto& p : places_)
       {
         hash_combine(h, p.hash());
       }
@@ -1117,211 +1284,61 @@ public:
 
     bool operator<(const exec_place::impl& rhs) const override
     {
-      // Different types: order by typeid
       if (typeid(*this) != typeid(rhs))
       {
         return typeid(*this).before(typeid(rhs));
       }
-      // Same type: safe to cast
       const auto& other = static_cast<const impl&>(rhs);
-      // Compare dims first, then places
-      if (!(dims == other.dims))
+      if (!(dims_ == other.dims_))
       {
-        // Use tuple comparison for consistent ordering
-        return ::std::tie(dims.x, dims.y, dims.z, dims.t)
-             < ::std::tie(other.dims.x, other.dims.y, other.dims.z, other.dims.t);
+        return ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t)
+             < ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
       }
-      return places < other.places;
+      return places_ < other.places_;
     }
 
-    const ::std::vector<exec_place>& get_places() const
-    {
-      return places;
-    }
+    // ===== Stream management =====
 
     stream_pool& get_stream_pool(bool for_computation) const override
     {
       _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool");
-      const auto& v = get_places();
-      _CCCL_ASSERT(v.size() > 0, "Grid must have at least one place");
-      return v[0].get_stream_pool(for_computation);
-    }
-
-    exec_place grid_activate(size_t i) const
-    {
-      const auto& v = get_places();
-      return v[i].activate();
-    }
-
-    void grid_deactivate(size_t i, exec_place p) const
-    {
-      const auto& v = get_places();
-      v[i].deactivate(p);
-    }
-
-    const exec_place& get_current_place()
-    {
-      return get_places()[current_p_1d];
-    }
-
-    // Set the current place from the 1D index within the grid (flattened grid)
-    void set_current_place(size_t p_index)
-    {
-      // Unset the previous place, if any
-      if (current_p_1d >= 0)
-      {
-        // First deactivate the previous place
-        grid_deactivate(current_p_1d, old_place);
-      }
-
-      // get the 1D index for that position
-      current_p_1d = (::std::ptrdiff_t) p_index;
-
-      // The returned value contains the state to restore when we deactivate the place
-      old_place = grid_activate(current_p_1d);
-    }
-
-    // Set the current place, given the position in the grid
-    void set_current_place(pos4 p)
-    {
-      size_t p_index = dims.get_index(p);
-      set_current_place(p_index);
-    }
-
-    void unset_current_place()
-    {
-      EXPECT(current_p_1d >= 0, "unset_current_place() called without corresponding call to set_current_place()");
-
-      // First deactivate the previous place
-      grid_deactivate(current_p_1d, old_place);
-      current_p_1d = -1;
-    }
-
-    ::std::ptrdiff_t current_place_id() const
-    {
-      return current_p_1d;
-    }
-
-    dim4 get_dims() const
-    {
-      return dims;
-    }
-
-    size_t get_dim(int axis_id) const
-    {
-      return dims.get(axis_id);
+      _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+      return places_[0].get_stream_pool(for_computation);
     }
 
-    size_t size() const override
-    {
-      return dims.size();
-    }
+    // ===== Grid-specific accessors =====
 
-    /* Get the place associated to this position in the grid */
-    const exec_place& get_place(pos4 p) const
-    {
-      return coords_to_place(p);
-    }
-
-    const exec_place& get_place(size_t p_index) const
+    const ::std::vector<exec_place>& get_places() const
     {
-      return coords_to_place(p_index);
+      return places_;
     }
 
   private:
-    // What is the execution place at theses coordinates in the exec place grid ?
-    const exec_place& coords_to_place(size_t c0, size_t c1 = 0, size_t c2 = 0, size_t c3 = 0) const
-    {
-      // Flatten the (c0, c1, c2, c3) vector into a global index
-      size_t index = c0 + dims.get(0) * (c1 + dims.get(1) * (c2 + c3 * dims.get(2)));
-      return places[index];
-    }
-
-    const exec_place& coords_to_place(pos4 coords) const
-    {
-      return coords_to_place(coords.x, coords.y, coords.z, coords.t);
-    }
-
-    // current position in the grid (flattened to 1D) if we have a grid of
-    // execution place. -1 indicates there is no current position.
-    ::std::ptrdiff_t current_p_1d = -1;
-
-    // saved state before setting the current place
-    exec_place old_place;
-
-    // dimensions of the "grid"
-    dim4 dims;
-    ::std::vector<exec_place> places;
+    dim4 dims_;
+    ::std::vector<exec_place> places_;
   };
 
-  ///@{ @name Constructors
-  dim4 get_dims() const
-  {
-    return get_impl()->get_dims();
-  }
-
-  size_t get_dim(int axis_id) const
-  {
-    return get_dims().get(axis_id);
-  }
-
-  size_t size() const
-  {
-    return get_dims().size();
-  }
-
   explicit operator bool() const
   {
-    return get_impl() != nullptr;
+    return exec_place::get_impl() != nullptr;
   }
 
-  /* Note that we compare against the exact same implementation : we could
-   * have equivalent grids with the same execution places, but to avoid a
-   * costly comparison we here only look for actually identical grids.
-   */
   bool operator==(const exec_place_grid& rhs) const
   {
     return *get_impl() == *(rhs.get_impl());
   }
 
-  ::std::ptrdiff_t current_place_id() const
-  {
-    return get_impl()->current_place_id();
-  }
-
-  const exec_place& get_place(pos4 p) const
-  {
-    return get_impl()->get_place(p);
-  }
-
+  /**
+   * @brief Get the vector of sub-places (grid-specific)
+   */
   const ::std::vector<exec_place>& get_places() const
   {
     return get_impl()->get_places();
   }
 
-  // Set the current place from the 1D index within the grid (flattened grid)
-  void set_current_place(size_t p_index)
-  {
-    return get_impl()->set_current_place(p_index);
-  }
-
-  // Get the current execution place
-  const exec_place& get_current_place()
-  {
-    return get_impl()->get_current_place();
-  }
-
-  // Set the current place, given the position in the grid
-  void set_current_place(pos4 p)
-  {
-    return get_impl()->set_current_place(p);
-  }
-
-  void unset_current_place()
-  {
-    return get_impl()->unset_current_place();
-  }
-
+  /**
+   * @brief Get the typed impl (for grid-specific operations)
+   */
   ::std::shared_ptr<impl> get_impl() const
   {
     _CCCL_ASSERT(::std::dynamic_pointer_cast<impl>(exec_place::get_impl()), "Invalid exec_place_grid impl");
@@ -1333,7 +1350,6 @@ public:
       : exec_place(nullptr)
   {}
 
-  // private:
   exec_place_grid(::std::shared_ptr<impl> p)
       : exec_place(mv(p))
   {}
@@ -1398,41 +1414,40 @@ inline exec_place data_place::affine_exec_place() const
                            + ::std::to_string(pimpl_->get_device_ordinal()));
 }
 
-/// Implementation deferred because we need the definition of exec_place_grid
-inline exec_place exec_place::iterator::operator*()
+// === Deferred implementations for get_place() ===
+
+inline exec_place exec_place::impl::get_place(size_t idx) const
 {
-  EXPECT(index < it_impl->size());
-  if (it_impl->is_grid())
-  {
-    return ::std::static_pointer_cast<exec_place_grid::impl>(it_impl)->get_place(index);
-  }
-  return exec_place(it_impl);
+  EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
+  // Fallback for scalar places: wraps `this` in a NON-owning shared_ptr (aliasing constructor with an empty owner),
+  // so the returned place must not outlive this impl. Concrete implementations should override this method.
+  return exec_place(
+    ::std::const_pointer_cast<impl>(::std::shared_ptr<const impl>(::std::shared_ptr<const impl>{}, this)));
 }
 
-//! Creates a grid by replicating an execution place multiple times
-inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt)
+inline exec_place exec_place_host::impl::get_place(size_t idx) const
 {
-  return make_grid(::std::vector<exec_place>(cnt, e));
+  EXPECT(idx == 0, "Index out of bounds for host exec_place");
+  return exec_place::host();
 }
 
-/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */
-inline exec_place_grid exec_place::as_grid() const
+inline exec_place exec_place_device::impl::get_place(size_t idx) const
 {
-  // Make sure it is really a grid
-  EXPECT(is_grid());
-  return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
+  EXPECT(idx == 0, "Index out of bounds for device exec_place");
+  return exec_place::device(devid_);
 }
 
-inline dim4 exec_place::grid_dims() const
+//! Creates a grid by replicating an execution place multiple times
+inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt)
 {
-  EXPECT(is_grid());
-  return ::std::static_pointer_cast<exec_place_grid::impl>(pimpl)->get_dims();
+  return make_grid(::std::vector<exec_place>(cnt, e));
 }
 
-inline size_t exec_place::grid_dim(int axis_id) const
+/* Deferred: needs the complete exec_place_grid::impl. Checks the dynamic type (not size() > 1) so that 1-element grids convert and the static cast is never applied to a non-grid impl. */
+inline exec_place_grid exec_place::as_grid() const
 {
-  EXPECT(is_grid());
-  return ::std::static_pointer_cast<exec_place_grid::impl>(pimpl)->get_dim(axis_id);
+  EXPECT((dynamic_cast<const exec_place_grid::impl*>(pimpl.get()) != nullptr), "as_grid() called on a non-grid exec_place");
+  return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
 }
 
 /* Get the first N available devices */
@@ -1466,8 +1481,7 @@ inline exec_place_grid exec_place::all_devices()
 //! Creates a cyclic partition of an execution place grid with specified strides
 inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id)
 {
-  const auto& g = e_place.as_grid();
-  dim4 g_dims   = e_place.get_dims();
+  dim4 g_dims = e_place.get_dims();
 
   /*
    *  Example : strides = (3, 2). tile 1 id = (1, 0)
@@ -1479,15 +1493,10 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
   // Dimension K_x of the new grid on axis x :
   // pos_x + K_x stride_x = dim_x
   // K_x = (dim_x - pos_x)/stride_x
-  dim4 size = dim4((g.get_dim(0) - tile_id.x + strides.x - 1) / strides.x,
-                   (g.get_dim(1) - tile_id.y + strides.y - 1) / strides.y,
-                   (g.get_dim(2) - tile_id.z + strides.z - 1) / strides.z,
-                   (g.get_dim(3) - tile_id.t + strides.t - 1) / strides.t);
-
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.x, strides.x, tile_id.x);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.y, strides.y, tile_id.y);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.z, strides.z, tile_id.z);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.t, strides.t, tile_id.t);
+  dim4 size = dim4((g_dims.x - tile_id.x + strides.x - 1) / strides.x,
+                   (g_dims.y - tile_id.y + strides.y - 1) / strides.y,
+                   (g_dims.z - tile_id.z + strides.z - 1) / strides.z,
+                   (g_dims.t - tile_id.t + strides.t - 1) / strides.t);
 
   ::std::vector<exec_place> places;
   places.reserve(size.x * size.y * size.z * size.t);
@@ -1500,7 +1509,7 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
       {
         for (size_t x = static_cast<size_t>(tile_id.x); x < g_dims.x; x += strides.x)
         {
-          places.push_back(g.get_place(pos4(x, y, z, t)));
+          places.push_back(e_place.get_place(pos4(x, y, z, t)));
         }
       }
     }
@@ -1519,18 +1528,15 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
 //! auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1))
 inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id)
 {
-  const auto& g = e_place.as_grid();
+  dim4 g_dims = e_place.get_dims();
 
-  // TODO define dim4=dim4 * dim4
   dim4 begin_coords(
     tile_id.x * tile_sizes.x, tile_id.y * tile_sizes.y, tile_id.z * tile_sizes.z, tile_id.t * tile_sizes.t);
 
-  // TODO define dim4=MIN(dim4,dim4)
-  // upper bound coordinate (excluded)
-  dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g.get_dim(0)),
-                  ::std::min((tile_id.y + 1) * tile_sizes.y, g.get_dim(1)),
-                  ::std::min((tile_id.z + 1) * tile_sizes.z, g.get_dim(2)),
-                  ::std::min((tile_id.t + 1) * tile_sizes.t, g.get_dim(3)));
+  dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g_dims.x),
+                  ::std::min((tile_id.y + 1) * tile_sizes.y, g_dims.y),
+                  ::std::min((tile_id.z + 1) * tile_sizes.z, g_dims.z),
+                  ::std::min((tile_id.t + 1) * tile_sizes.t, g_dims.t));
 
   //    fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.x, tile_sizes.x, tile_id.x);
   //    fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.y, tile_sizes.y, tile_id.y);
@@ -1559,7 +1565,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_
       {
         for (size_t x = static_cast<size_t>(begin_coords.x); x < end_coords.x; x++)
         {
-          places.push_back(g.get_place(pos4(x, y, z, t)));
+          places.push_back(e_place.get_place(pos4(x, y, z, t)));
         }
       }
     }
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
index 4176e74b01d..5c000862613 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -75,24 +75,23 @@ public:
   cudaStream_t get_stream() const
   {
     const auto& e_place = get_exec_place();
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Even with a grid, when we have a ctx.task construct we have not
       // yet selected/activated a specific place. So we take the main
       // stream associated to the whole task in that case.
-      ::std::ptrdiff_t current_place_id = e_place.as_grid().current_place_id();
-      return (current_place_id < 0 ? dstream.stream : stream_grid[current_place_id].stream);
+      ::std::ptrdiff_t current_id = e_place.current_place_id();
+      return (current_id < 0 ? dstream.stream : stream_grid[current_id].stream);
     }
 
     return dstream.stream;
   }
 
-  // TODO use a pos4 and check that we have a grid, of the proper dimension
   cudaStream_t get_stream(size_t pos) const
   {
     const auto& e_place = get_exec_place();
 
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       return stream_grid[pos].stream;
     }
@@ -116,19 +115,15 @@ public:
     event_list ready_prereqs = acquire(ctx);
 
     /* Select the stream(s) */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // We have currently no way to pass an array of per-place streams
       _CCCL_ASSERT(automatic_stream, "automatic stream is not enabled");
 
-      // Note: we store grid in a variable to avoid dangling references
-      // because the compiler does not know we are making a reference to
-      // a vector that remains valid
-      const auto& grid   = e_place.as_grid();
-      const auto& places = grid.get_places();
-      for (const exec_place& p : places)
+      // Get stream for each place in the grid
+      for (size_t i = 0; i < e_place.size(); ++i)
       {
-        stream_grid.push_back(p.getStream(true));
+        stream_grid.push_back(e_place.get_place(i).getStream(true));
       }
 
       EXPECT(stream_grid.size() > 0UL);
@@ -187,7 +182,7 @@ public:
     }
 
     // Select one stream to sync with all prereqs
-    auto& s0 = e_place.is_grid() ? stream_grid[0] : dstream;
+    auto& s0 = (e_place.size() > 1) ? stream_grid[0] : dstream;
 
     /* Ensure that stream depend(s) on prereqs */
     submitted_events = stream_async_op(ctx, s0, ready_prereqs);
@@ -196,8 +191,8 @@ public:
       submitted_events.set_symbol("Submitted" + get_symbol());
     }
 
-    /* If this is a grid, all other streams must wait on s0 too */
-    if (e_place.is_grid())
+    /* If this is a multi-place grid, all other streams must wait on s0 too */
+    if (e_place.size() > 1)
     {
       insert_dependencies(stream_grid);
     }
@@ -215,17 +210,17 @@ public:
 
   void set_current_place(pos4 p)
   {
-    get_exec_place().as_grid().set_current_place(p);
+    get_exec_place().set_current_place(p);
   }
 
   void unset_current_place()
   {
-    return get_exec_place().as_grid().unset_current_place();
+    get_exec_place().unset_current_place();
   }
 
-  const exec_place& get_current_place()
+  exec_place get_current_place()
   {
-    return get_exec_place().as_grid().get_current_place();
+    return get_exec_place().get_current_place();
   }
 
   /* End the task, but do not clear its data structures yet */
@@ -236,9 +231,8 @@ public:
     event_list end_list;
 
     const auto& e_place = get_exec_place();
-    // Create an event with this stream
 
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // s0 depends on all other streams
       for (size_t i = 1; i < stream_grid.size(); i++)

From 6e2df83648b97d4fbd33799379b98ba59ce8667f Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 22:22:57 -0400
Subject: [PATCH 02/12] Remove exec_place_cuda_stream shell class

The shell class added no value - just use the dynamic interface
directly and return exec_place from the factory methods.

Made-with: Cursor
---
 .../__stf/places/exec/cuda_stream.cuh         | 125 ++++++++----------
 .../cuda/experimental/__stf/places/places.cuh |   5 +-
 2 files changed, 58 insertions(+), 72 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
index 5cf256cb9ea..c4935000adc 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
@@ -30,97 +30,84 @@
 namespace cuda::experimental::stf
 {
 /**
- * @brief Designates execution that is to run on a specific CUDA stream
- *
+ * @brief Implementation for CUDA stream execution places
  */
-class exec_place_cuda_stream : public exec_place
+class exec_place_cuda_stream_impl : public exec_place::impl
 {
 public:
-  class impl : public exec_place::impl
+  exec_place_cuda_stream_impl(const decorated_stream& dstream)
+      : exec_place::impl(data_place::device(dstream.dev_id))
+      , dstream_(dstream)
+      , dummy_pool_(dstream)
+  {}
+
+  exec_place get_place(size_t idx) const override
   {
-  public:
-    impl(const decorated_stream& _dstream)
-        : exec_place::impl(data_place::device(_dstream.dev_id))
-        , dstream_(_dstream)
-        , dummy_pool_(_dstream)
-    {}
-
-    // Grid interface - cuda_stream is a 1-element grid
-    exec_place get_place(size_t idx) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
-      return exec_place::cuda_stream(dstream_);
-    }
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::cuda_stream(dstream_);
+  }
 
-    exec_place activate(size_t idx) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
-      return exec_place::device(dstream_.dev_id).activate();
-    }
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::device(dstream_.dev_id).activate();
+  }
 
-    void deactivate(size_t idx, const exec_place& prev) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
-      exec_place::device(dstream_.dev_id).deactivate(prev);
-    }
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    exec_place::device(dstream_.dev_id).deactivate(prev);
+  }
 
-    stream_pool& get_stream_pool(bool) const override
-    {
-      return dummy_pool_;
-    }
+  stream_pool& get_stream_pool(bool) const override
+  {
+    return dummy_pool_;
+  }
 
-    ::std::string to_string() const override
-    {
-      return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
-    }
+  ::std::string to_string() const override
+  {
+    return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
+  }
 
-    bool operator==(const exec_place::impl& rhs) const override
+  bool operator==(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return false;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return dstream_.stream == other.dstream_.stream;
+      return false;
     }
+    const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
+    return dstream_.stream == other.dstream_.stream;
+  }
 
-    size_t hash() const override
-    {
-      return ::std::hash<cudaStream_t>()(dstream_.stream);
-    }
+  size_t hash() const override
+  {
+    return ::std::hash<cudaStream_t>()(dstream_.stream);
+  }
 
-    bool operator<(const exec_place::impl& rhs) const override
+  bool operator<(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return dstream_.stream < other.dstream_.stream;
+      return typeid(*this).before(typeid(rhs));
     }
-
-  private:
-    decorated_stream dstream_;
-    mutable stream_pool dummy_pool_;
-  };
-
-public:
-  exec_place_cuda_stream(const decorated_stream& dstream)
-      : exec_place(::std::make_shared<impl>(dstream))
-  {
-    static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place),
-                  "exec_place_cuda_stream cannot add state; it would be sliced away.");
+    const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
+    return dstream_.stream < other.dstream_.stream;
   }
+
+private:
+  decorated_stream dstream_;
+  mutable stream_pool dummy_pool_;
 };
 
-inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)
+inline exec_place exec_place::cuda_stream(cudaStream_t stream)
 {
   int devid = get_device_from_stream(stream);
-  return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));
+  return exec_place(
+    ::std::make_shared<exec_place_cuda_stream_impl>(decorated_stream(stream, get_stream_id(stream), devid)));
 }
 
-inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)
+inline exec_place exec_place::cuda_stream(const decorated_stream& dstream)
 {
-  return exec_place_cuda_stream(dstream);
+  return exec_place(::std::make_shared<exec_place_cuda_stream_impl>(dstream));
 }
 } // end namespace cuda::experimental::stf
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index f1708188986..e83c22c8aba 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -51,7 +51,6 @@ namespace cuda::experimental::stf
 class exec_place;
 class exec_place_host;
 class exec_place_grid;
-class exec_place_cuda_stream;
 
 // Green contexts are only supported since CUDA 12.4
 #if _CCCL_CTK_AT_LEAST(12, 4)
@@ -854,8 +853,8 @@ public:
   static exec_place green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place = false);
 #endif // _CCCL_CTK_AT_LEAST(12, 4)
 
-  static exec_place_cuda_stream cuda_stream(cudaStream_t stream);
-  static exec_place_cuda_stream cuda_stream(const decorated_stream& dstream);
+  static exec_place cuda_stream(cudaStream_t stream);
+  static exec_place cuda_stream(const decorated_stream& dstream);
 
   /**
    * @brief Returns the currently active device.

From eb16cfeaf922480e12a1cc7e86cf4616f12a4e82 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 22:46:50 -0400
Subject: [PATCH 03/12] Remove exec_place_green_ctx shell class

Same as exec_place_cuda_stream - the shell class adds no value.
Use the dynamic interface directly and return exec_place from factory.

Made-with: Cursor
---
 .../__stf/places/exec/green_context.cuh       | 173 ++++++++----------
 .../cuda/experimental/__stf/places/places.cuh |   3 -
 2 files changed, 80 insertions(+), 96 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
index 7a7c6b62c8d..b0dc687be6d 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
@@ -252,127 +252,114 @@ private:
 };
 
 /**
- * @brief Designates execution that is to run on a green context. Initialize with the device ordinal and green_context
+ * @brief Implementation for green context execution places
  */
-class exec_place_green_ctx : public exec_place
+class exec_place_green_ctx_impl : public exec_place::impl
 {
 public:
-  class impl : public exec_place::impl
+  /**
+   * @brief Construct a green context execution place
+   *
+   * @param gc_view The green context view
+   * @param use_green_ctx_data_place If true, use a green context data place as the
+   *        affine data place. If false (default), use a regular device data place instead.
+   */
+  exec_place_green_ctx_impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
+      : exec_place::impl(
+          use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid))
+      , devid_(gc_view.devid)
+      , g_ctx_(gc_view.g_ctx)
+      , pool_(mv(gc_view.pool))
+  {}
+
+  // This is used to implement deactivate and wrap an existing context
+  exec_place_green_ctx_impl(CUcontext saved_context)
+      : driver_context_(saved_context)
+  {}
+
+  exec_place get_place(size_t idx) const override
   {
-  public:
-    /**
-     * @brief Construct a green context execution place
-     *
-     * @param gc_view The green context view
-     * @param use_green_ctx_data_place If true, use a green context data place as the
-     *        affine data place. If false (default), use a regular device data place instead.
-     */
-    impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
-        : exec_place::impl(
-            use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid))
-        , devid_(gc_view.devid)
-        , g_ctx_(gc_view.g_ctx)
-        , pool_(mv(gc_view.pool))
-    {}
-
-    // This is used to implement deactivate and wrap an existing context
-    impl(CUcontext saved_context)
-        : driver_context_(saved_context)
-    {}
-
-    // Grid interface - green_ctx is a 1-element grid
-    exec_place get_place(size_t idx) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
-      return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_));
-    }
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+    return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_));
+  }
 
-    exec_place activate(size_t idx) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
 
-      // Save the current context and transform it into a fake green context place
-      CUcontext current_ctx;
-      cuda_safe_call(cuCtxGetCurrent(&current_ctx));
-      exec_place result = exec_place(::std::make_shared<impl>(current_ctx));
+    // Save the current context and transform it into a fake green context place
+    CUcontext current_ctx;
+    cuda_safe_call(cuCtxGetCurrent(&current_ctx));
+    exec_place result = exec_place(::std::make_shared<exec_place_green_ctx_impl>(current_ctx));
 
-      // Convert the green context to a primary context
-      cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_));
-      cuda_safe_call(cuCtxSetCurrent(driver_context_));
+    // Convert the green context to a primary context
+    cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_));
+    cuda_safe_call(cuCtxSetCurrent(driver_context_));
 
-      return result;
-    }
+    return result;
+  }
 
-    void deactivate(size_t idx, const exec_place& prev) const override
-    {
-      EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
 
-      auto prev_impl      = ::std::static_pointer_cast<impl>(prev.get_impl());
-      CUcontext saved_ctx = prev_impl->driver_context_;
+    auto prev_impl      = ::std::static_pointer_cast<exec_place_green_ctx_impl>(prev.get_impl());
+    CUcontext saved_ctx = prev_impl->driver_context_;
 
 #  ifdef DEBUG
-      CUcontext current_ctx;
-      cuda_safe_call(cuCtxGetCurrent(&current_ctx));
-      assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_));
+    CUcontext current_ctx;
+    cuda_safe_call(cuCtxGetCurrent(&current_ctx));
+    assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_));
 #  endif
 
-      cuda_safe_call(cuCtxSetCurrent(saved_ctx));
-    }
+    cuda_safe_call(cuCtxSetCurrent(saved_ctx));
+  }
 
-    ::std::string to_string() const override
-    {
-      return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")";
-    }
+  ::std::string to_string() const override
+  {
+    return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")";
+  }
 
-    stream_pool& get_stream_pool(bool) const override
-    {
-      return pool_;
-    }
+  stream_pool& get_stream_pool(bool) const override
+  {
+    return pool_;
+  }
 
-    bool operator==(const exec_place::impl& rhs) const override
+  bool operator==(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return false;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return g_ctx_ == other.g_ctx_;
+      return false;
     }
+    const auto& other = static_cast<const exec_place_green_ctx_impl&>(rhs);
+    return g_ctx_ == other.g_ctx_;
+  }
 
-    size_t hash() const override
-    {
-      return ::std::hash<CUgreenCtx>()(g_ctx_);
-    }
+  size_t hash() const override
+  {
+    return ::std::hash<CUgreenCtx>()(g_ctx_);
+  }
 
-    bool operator<(const exec_place::impl& rhs) const override
+  bool operator<(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return g_ctx_ < other.g_ctx_;
+      return typeid(*this).before(typeid(rhs));
     }
-
-  private:
-    int devid_                        = -1;
-    CUgreenCtx g_ctx_                 = {};
-    mutable CUcontext driver_context_ = {};
-    mutable stream_pool pool_;
-  };
-
-public:
-  exec_place_green_ctx(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
-      : exec_place(::std::make_shared<impl>(mv(gc_view), use_green_ctx_data_place))
-  {
-    static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place),
-                  "exec_place_green_ctx cannot add state; it would be sliced away.");
+    const auto& other = static_cast<const exec_place_green_ctx_impl&>(rhs);
+    return g_ctx_ < other.g_ctx_;
   }
+
+private:
+  int devid_                        = -1;
+  CUgreenCtx g_ctx_                 = {};
+  mutable CUcontext driver_context_ = {};
+  mutable stream_pool pool_;
 };
 
 inline exec_place exec_place::green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place)
 {
-  return exec_place_green_ctx(gc_view, use_green_ctx_data_place);
+  return exec_place(::std::make_shared<exec_place_green_ctx_impl>(gc_view, use_green_ctx_data_place));
 }
 
 inline ::std::shared_ptr<void> green_ctx_data_place_impl::get_affine_exec_impl() const
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index e83c22c8aba..467c1b2d474 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -53,9 +53,6 @@ class exec_place_host;
 class exec_place_grid;
 
 // Green contexts are only supported since CUDA 12.4
-#if _CCCL_CTK_AT_LEAST(12, 4)
-class exec_place_green_ctx;
-#endif // _CCCL_CTK_AT_LEAST(12, 4)
 
 //! Function type for computing executor placement from data coordinates
 using get_executor_func_t = pos4 (*)(pos4, dim4, dim4);

From a99fd0639f29f227f78bb94a8ccb973a4acfb063 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 23:37:12 -0400
Subject: [PATCH 04/12] Clean up affine_data_place() return type and remove
 virtual is_grid()

- Remove meaningless const from return-by-value affine_data_place()
- Remove virtual is_grid() from impl - just use size() > 1 directly

Made-with: Cursor
---
 .../cuda/experimental/__stf/places/places.cuh   | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 467c1b2d474..912c5737f1d 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -466,7 +466,7 @@ public:
 
     // ===== Properties =====
 
-    virtual const data_place affine_data_place() const
+    virtual data_place affine_data_place() const
     {
       return affine;
     }
@@ -486,15 +486,6 @@ public:
       return affine.is_device();
     }
 
-    /**
-     * @brief Check if this is a multi-element grid (size > 1)
-     * @deprecated Use size() > 1 instead
-     */
-    virtual bool is_grid() const
-    {
-      return size() > 1;
-    }
-
     virtual void set_affine_data_place(data_place place)
     {
       affine = mv(place);
@@ -760,7 +751,7 @@ public:
     return pimpl->to_string();
   }
 
-  const data_place affine_data_place() const
+  data_place affine_data_place() const
   {
     return pimpl->affine_data_place();
   }
@@ -803,7 +794,7 @@ public:
    */
   bool is_grid() const
   {
-    return pimpl->is_grid();
+    return size() > 1;
   }
 
   /**
@@ -1037,7 +1028,7 @@ public:
       _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev");
     }
 
-    const data_place affine_data_place() const override
+    data_place affine_data_place() const override
     {
       return data_place::host();
     }

From f7d6c0520f18cf053f5e00b4fa75552ebdc2f8da Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 23:43:16 -0400
Subject: [PATCH 05/12] Simplify exec_place comparison using three-way cmp()

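Replace separate operator== and operator< virtual methods with a single
cmp() method that returns -1/0/1, consistent with data_place_interface.

Behavior notes: the base impl now orders by the affine data place itself
instead of its device ordinal, and equality now also requires matching
dynamic types (the base operator== previously compared only the affine
place, and the grid operator== used dynamic_cast rather than an exact
typeid match).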

Made-with: Cursor
---
 .../__stf/places/exec/cuda_stream.cuh         | 24 +++---
 .../__stf/places/exec/green_context.cuh       | 24 +++---
 .../cuda/experimental/__stf/places/places.cuh | 85 +++++++++++--------
 3 files changed, 73 insertions(+), 60 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
index c4935000adc..a490f57054f 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
@@ -69,14 +69,22 @@ public:
     return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
   }
 
-  bool operator==(const exec_place::impl& rhs) const override
+  int cmp(const exec_place::impl& rhs) const override
   {
     if (typeid(*this) != typeid(rhs))
     {
-      return false;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
     const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
-    return dstream_.stream == other.dstream_.stream;
+    if (dstream_.stream < other.dstream_.stream)
+    {
+      return -1;
+    }
+    if (other.dstream_.stream < dstream_.stream)
+    {
+      return 1;
+    }
+    return 0;
   }
 
   size_t hash() const override
@@ -84,16 +92,6 @@ public:
     return ::std::hash<cudaStream_t>()(dstream_.stream);
   }
 
-  bool operator<(const exec_place::impl& rhs) const override
-  {
-    if (typeid(*this) != typeid(rhs))
-    {
-      return typeid(*this).before(typeid(rhs));
-    }
-    const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
-    return dstream_.stream < other.dstream_.stream;
-  }
-
 private:
   decorated_stream dstream_;
   mutable stream_pool dummy_pool_;
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
index b0dc687be6d..284cb3f134e 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
@@ -325,14 +325,22 @@ public:
     return pool_;
   }
 
-  bool operator==(const exec_place::impl& rhs) const override
+  int cmp(const exec_place::impl& rhs) const override
   {
     if (typeid(*this) != typeid(rhs))
     {
-      return false;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
     const auto& other = static_cast<const exec_place_green_ctx_impl&>(rhs);
-    return g_ctx_ == other.g_ctx_;
+    if (g_ctx_ < other.g_ctx_)
+    {
+      return -1;
+    }
+    if (other.g_ctx_ < g_ctx_)
+    {
+      return 1;
+    }
+    return 0;
   }
 
   size_t hash() const override
@@ -340,16 +348,6 @@ public:
     return ::std::hash<CUgreenCtx>()(g_ctx_);
   }
 
-  bool operator<(const exec_place::impl& rhs) const override
-  {
-    if (typeid(*this) != typeid(rhs))
-    {
-      return typeid(*this).before(typeid(rhs));
-    }
-    const auto& other = static_cast<const exec_place_green_ctx_impl&>(rhs);
-    return g_ctx_ < other.g_ctx_;
-  }
-
 private:
   int devid_                        = -1;
   CUgreenCtx g_ctx_                 = {};
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 912c5737f1d..a07ac3ffc97 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -493,9 +493,25 @@ public:
 
     // ===== Comparison =====
 
-    virtual bool operator==(const impl& rhs) const
+    /**
+     * @brief Three-way comparison
+     * @return -1 if *this < rhs, 0 if *this == rhs, 1 if *this > rhs
+     */
+    virtual int cmp(const impl& rhs) const
     {
-      return affine == rhs.affine;
+      if (typeid(*this) != typeid(rhs))
+      {
+        return typeid(*this).before(typeid(rhs)) ? -1 : 1;
+      }
+      if (affine < rhs.affine)
+      {
+        return -1;
+      }
+      if (rhs.affine < affine)
+      {
+        return 1;
+      }
+      return 0;
     }
 
     virtual size_t hash() const
@@ -503,15 +519,6 @@ public:
       return affine.hash();
     }
 
-    virtual bool operator<(const impl& rhs) const
-    {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      return device_ordinal(affine) < device_ordinal(rhs.affine);
-    }
-
     // ===== Stream management =====
 
     virtual stream_pool& get_stream_pool(bool for_computation) const
@@ -543,8 +550,13 @@ public:
 
   bool operator==(const exec_place& rhs) const
   {
-    return *pimpl == *rhs.pimpl;
+    if (pimpl.get() == rhs.pimpl.get())
+    {
+      return true;
+    }
+    return pimpl->cmp(*rhs.pimpl) == 0;
   }
+
   bool operator!=(const exec_place& rhs) const
   {
     return !(*this == rhs);
@@ -552,7 +564,7 @@ public:
 
   bool operator<(const exec_place& rhs) const
   {
-    return *pimpl < *rhs.pimpl;
+    return pimpl->cmp(*rhs.pimpl) < 0;
   }
 
   bool operator>(const exec_place& rhs) const
@@ -1249,14 +1261,34 @@ public:
 
     // ===== Comparison =====
 
-    bool operator==(const exec_place::impl& rhs) const override
+    int cmp(const exec_place::impl& rhs) const override
     {
-      auto other = dynamic_cast<const impl*>(&rhs);
-      if (!other)
+      if (typeid(*this) != typeid(rhs))
+      {
+        return typeid(*this).before(typeid(rhs)) ? -1 : 1;
+      }
+      const auto& other = static_cast<const impl&>(rhs);
+      // Compare dims first
+      auto this_dims  = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t);
+      auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
+      if (this_dims < other_dims)
+      {
+        return -1;
+      }
+      if (other_dims < this_dims)
+      {
+        return 1;
+      }
+      // Then compare places
+      if (places_ < other.places_)
+      {
+        return -1;
+      }
+      if (other.places_ < places_)
       {
-        return false;
+        return 1;
       }
-      return dims_ == other->dims_ && places_ == other->places_;
+      return 0;
     }
 
     size_t hash() const override
@@ -1269,21 +1301,6 @@ public:
       return h;
     }
 
-    bool operator<(const exec_place::impl& rhs) const override
-    {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      if (!(dims_ == other.dims_))
-      {
-        return ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t)
-             < ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
-      }
-      return places_ < other.places_;
-    }
-
     // ===== Stream management =====
 
     stream_pool& get_stream_pool(bool for_computation) const override
@@ -1312,7 +1329,7 @@ public:
 
   bool operator==(const exec_place_grid& rhs) const
   {
-    return *get_impl() == *(rhs.get_impl());
+    return get_impl()->cmp(*rhs.get_impl()) == 0;
   }
 
   /**

From 86aab4d26375c7fce064bb835b2d8746f2988283 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Fri, 13 Mar 2026 23:47:44 -0400
Subject: [PATCH 06/12] Fix place_partition for 1-element grids

With the unified grid model, 1-element grids have size()==1 but
is_device()==false. Update place_partition to handle this case by
extracting the underlying scalar place from 1-element grids.

Made-with: Cursor
---
 .../__stf/places/place_partition.cuh          | 78 +++++++++++++++----
 1 file changed, 64 insertions(+), 14 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
index e0d2afed705..002b4134830 100644
--- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
@@ -222,9 +222,9 @@ private:
   /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */
   void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope)
   {
+    // Handle multi-element grids by recursively partitioning
     if (place.size() > 1 && scope == place_partition_scope::cuda_stream)
     {
-      // Recursively partition grid into devices, then into streams
       for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device))
       {
         auto device_p_places = place_partition(device_p, handle, place_partition_scope::cuda_stream).sub_places;
@@ -233,6 +233,27 @@ private:
       return;
     }
 
+    // Handle scalar places (including 1-element grids) for cuda_stream scope
+    if (place.size() == 1 && scope == place_partition_scope::cuda_stream)
+    {
+      // Get the underlying scalar place (for 1-element grids, get the single element)
+      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
+      if (!scalar_place.is_device())
+      {
+        // Host or other non-device place - no streams to partition into
+        sub_places.push_back(place);
+        return;
+      }
+      auto& pool = scalar_place.get_stream_pool(true);
+      for (size_t i = 0; i < pool.size(); i++)
+      {
+        decorated_stream dstream = pool.next(scalar_place);
+        sub_places.push_back(exec_place::cuda_stream(dstream));
+      }
+      return;
+    }
+
+    // Legacy device path. NOTE(review): appears unreachable now that size() == 1 is handled above - confirm.
     if (place.is_device() && scope == place_partition_scope::cuda_stream)
     {
       auto& pool = place.get_stream_pool(true);
@@ -258,18 +279,40 @@ private:
       return;
     }
 
+    // Handle scalar places (including 1-element grids) for green_context scope
+    if (place.size() == 1 && scope == place_partition_scope::green_context)
+    {
+      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
+      if (!scalar_place.is_device())
+      {
+        sub_places.push_back(place);
+        return;
+      }
+      int dev_id = device_ordinal(scalar_place.affine_data_place());
+
+      const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE");
+      int sm_cnt      = env ? atoi(env) : 8;
+
+      auto h = handle.get_gc_helper(dev_id, sm_cnt);
+
+      size_t cnt = h->get_count();
+      for (size_t i = 0; i < cnt; i++)
+      {
+        sub_places.push_back(exec_place::green_ctx(h->get_view(i)));
+      }
+      return;
+    }
+
+    // Legacy device path. NOTE(review): appears unreachable now that size() == 1 is handled above - confirm.
     if (place.is_device() && scope == place_partition_scope::green_context)
     {
-      // Find the device associated to the place, and get the green context helper
       int dev_id = device_ordinal(place.affine_data_place());
 
-      // 8 SMs per green context is a granularity that should work on any arch.
       const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE");
       int sm_cnt      = env ? atoi(env) : 8;
 
       auto h = handle.get_gc_helper(dev_id, sm_cnt);
 
-      // Get views of green context out of the helper to create execution places
       size_t cnt = h->get_count();
       for (size_t i = 0; i < cnt; i++)
       {
@@ -291,19 +334,26 @@ private:
 #endif // _CCCL_CTK_BELOW(12, 4)
     _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle.");
 
-    if (place.size() > 1 && scope == place_partition_scope::cuda_device)
+    if (scope == place_partition_scope::cuda_device)
     {
-      // Get places from the grid
-      for (size_t i = 0; i < place.size(); ++i)
+      if (place.size() > 1)
       {
-        sub_places.push_back(place.get_place(i));
+        // Multi-element grid: extract all places
+        for (size_t i = 0; i < place.size(); ++i)
+        {
+          sub_places.push_back(place.get_place(i));
+        }
+      }
+      else if (place.is_device())
+      {
+        // Scalar device place
+        sub_places.push_back(place);
+      }
+      else
+      {
+        // 1-element grid or other scalar place: extract the underlying place
+        sub_places.push_back(place.get_place(0));
       }
-      return;
-    }
-
-    if (place.is_device() && scope == place_partition_scope::cuda_device)
-    {
-      sub_places.push_back(place);
       return;
     }
 

From 0ef69031d488f2579d8342b49a9b655e9e76b3f5 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Sat, 14 Mar 2026 00:18:15 -0400
Subject: [PATCH 07/12] Simplify exec_place::impl by removing unnecessary
 virtuals

- Remove virtual is_host()/is_device() from impl; move logic to shell
  (base impl already returns correct values via affine data place)
- Move grid iteration state (current_idx, saved_prev_impl) to grid impl
  since only multi-element grids use iteration
- Add virtual accessors for grid state with assertions for misuse

Made-with: Cursor
---
 .../cuda/experimental/__stf/places/places.cuh | 102 +++++++++++-------
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index a07ac3ffc97..767c810e6b9 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -443,8 +443,7 @@ public:
       {
         cuda_safe_call(cudaSetDevice(new_dev_id));
       }
-      auto old_dev = data_place::device(old_dev_id);
-      return exec_place(mv(old_dev));
+      return exec_place(data_place::device(old_dev_id));
     }
 
     /**
@@ -476,16 +475,6 @@ public:
       return "exec(" + affine.to_string() + ")";
     }
 
-    virtual bool is_host() const
-    {
-      return affine.is_host();
-    }
-
-    virtual bool is_device() const
-    {
-      return affine.is_device();
-    }
-
     virtual void set_affine_data_place(data_place place)
     {
       affine = mv(place);
@@ -529,15 +518,29 @@ public:
     static constexpr size_t pool_size      = 4;
     static constexpr size_t data_pool_size = 4;
 
+    // Grid iteration state - only meaningful for multi-element grids
+    virtual ::std::ptrdiff_t get_current_idx() const
+    {
+      return -1;
+    }
+    virtual void set_current_idx(::std::ptrdiff_t) const
+    {
+      _CCCL_ASSERT(false, "set_current_idx called on non-grid exec_place");
+    }
+    virtual ::std::shared_ptr<impl> get_saved_prev_impl() const
+    {
+      return nullptr;
+    }
+    virtual void set_saved_prev_impl(::std::shared_ptr<impl>) const
+    {
+      _CCCL_ASSERT(false, "set_saved_prev_impl called on non-grid exec_place");
+    }
+
   protected:
     friend class exec_place;
     data_place affine = data_place::invalid();
     mutable stream_pool pool_compute;
     mutable stream_pool pool_data;
-
-    // Current place state for grid iteration
-    mutable ::std::ptrdiff_t current_idx = -1;
-    mutable ::std::shared_ptr<impl> saved_prev_impl;
   };
 
   exec_place() = default;
@@ -710,14 +713,15 @@ public:
    */
   void set_current_place(size_t idx)
   {
-    if (pimpl->current_idx >= 0)
+    auto cur_idx = pimpl->get_current_idx();
+    if (cur_idx >= 0)
     {
-      exec_place saved_prev(pimpl->saved_prev_impl);
-      pimpl->deactivate(pimpl->current_idx, saved_prev);
+      exec_place saved_prev(pimpl->get_saved_prev_impl());
+      pimpl->deactivate(cur_idx, saved_prev);
     }
-    pimpl->current_idx     = static_cast<::std::ptrdiff_t>(idx);
-    exec_place prev        = pimpl->activate(idx);
-    pimpl->saved_prev_impl = prev.pimpl;
+    pimpl->set_current_idx(static_cast<::std::ptrdiff_t>(idx));
+    exec_place prev = pimpl->activate(idx);
+    pimpl->set_saved_prev_impl(prev.pimpl);
   }
 
   /**
@@ -733,10 +737,11 @@ public:
    */
   void unset_current_place()
   {
-    EXPECT(pimpl->current_idx >= 0, "unset_current_place() called without corresponding set_current_place()");
-    exec_place saved_prev(pimpl->saved_prev_impl);
-    pimpl->deactivate(pimpl->current_idx, saved_prev);
-    pimpl->current_idx = -1;
+    auto cur_idx = pimpl->get_current_idx();
+    EXPECT(cur_idx >= 0, "unset_current_place() called without corresponding set_current_place()");
+    exec_place saved_prev(pimpl->get_saved_prev_impl());
+    pimpl->deactivate(cur_idx, saved_prev);
+    pimpl->set_current_idx(-1);
   }
 
   /**
@@ -744,8 +749,9 @@ public:
    */
   exec_place get_current_place() const
   {
-    EXPECT(pimpl->current_idx >= 0, "No current place set");
-    return get_place(pimpl->current_idx);
+    auto cur_idx = pimpl->get_current_idx();
+    EXPECT(cur_idx >= 0, "No current place set");
+    return get_place(cur_idx);
   }
 
   /**
@@ -753,7 +759,7 @@ public:
    */
   ::std::ptrdiff_t current_place_id() const
   {
-    return pimpl->current_idx;
+    return pimpl->get_current_idx();
   }
 
   // ===== Properties =====
@@ -792,12 +798,12 @@ public:
 
   bool is_host() const
   {
-    return pimpl->is_host();
+    return affine_data_place().is_host();
   }
 
   bool is_device() const
   {
-    return pimpl->is_device();
+    return affine_data_place().is_device();
   }
 
   /**
@@ -1249,16 +1255,6 @@ public:
            + "x" + ::std::to_string(dims_.t) + ")";
     }
 
-    bool is_device() const override
-    {
-      return false;
-    }
-
-    bool is_host() const override
-    {
-      return false;
-    }
-
     // ===== Comparison =====
 
     int cmp(const exec_place::impl& rhs) const override
@@ -1317,9 +1313,33 @@ public:
       return places_;
     }
 
+    // ===== Grid iteration state =====
+
+    ::std::ptrdiff_t get_current_idx() const override
+    {
+      return current_idx_;
+    }
+
+    void set_current_idx(::std::ptrdiff_t idx) const override
+    {
+      current_idx_ = idx;
+    }
+
+    ::std::shared_ptr<exec_place::impl> get_saved_prev_impl() const override
+    {
+      return saved_prev_impl_;
+    }
+
+    void set_saved_prev_impl(::std::shared_ptr<exec_place::impl> p) const override
+    {
+      saved_prev_impl_ = mv(p);
+    }
+
   private:
     dim4 dims_;
     ::std::vector<exec_place> places_;
+    mutable ::std::ptrdiff_t current_idx_ = -1;
+    mutable ::std::shared_ptr<exec_place::impl> saved_prev_impl_;
   };
 
   explicit operator bool() const

From 672cf33d0a7f03331fc327c91992b10808e4533e Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Mon, 16 Mar 2026 13:42:21 -0400
Subject: [PATCH 08/12] Fix is_grid() and collapse 1-element grids in factory
 functions

The condition `is_grid()` was changed to `size() > 1` but this broke
1-element grids (e.g. `all_devices()` on single-GPU systems).

Fix 1: Restore is_grid() to detect any grid by checking if
affine_data_place() is invalid (only grids have invalid affine).

Fix 2: Factory functions now collapse 1-element grids to scalars:
- make_grid(), all_devices(), n_devices(), repeat()
- partition_cyclic(), partition_tile()

This ensures that by construction, any true grid has size() > 1,
making `size() > 1` equivalent to `is_grid()` in practice.

Benefits:
- Simpler mental model: grids always have multiple elements
- No edge cases for 1-element grids
- Single-GPU `all_devices()` returns `device(0)` directly

Return type changes (exec_place_grid -> exec_place):
- all_devices(), n_devices(), repeat(), make_grid()
- partition_cyclic(), partition_tile()
- place_partition::as_grid()

Made-with: Cursor
---
 .../experimental/__stf/internal/launch.cuh    |  4 +-
 .../__stf/internal/parallel_for_scope.cuh     | 11 ++--
 .../__stf/places/place_partition.cuh          |  6 +--
 .../cuda/experimental/__stf/places/places.cuh | 50 +++++++++++++------
 cudax/test/stf/places/recursion.cu            |  2 +-
 5 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index f20205acd59..3888efef429 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -331,11 +331,11 @@ public:
     assert(e_place.affine_data_place() == t.get_affine_data_place());
 
     /*
-     * If we have a multi-place grid, the implicit affine partitioner is the blocked_partition.
+     * If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition.
      *
      * An explicit composite data place is required per data dependency to customize this behaviour.
      */
-    if (e_place.size() > 1)
+    if (e_place.is_grid())
     {
       // Create a composite data place defined by the grid of places + the partitioning function
       t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index 90acd49ad22..50d4f523946 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -550,8 +550,8 @@ public:
     // If there is a partitioner, we ensure there is a proper affine data place for this execution place
     if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
     {
-      // This is only meaningful for multi-place grids
-      if (e_place.size() > 1)
+      // Grids (including 1-element grids) need a composite data place
+      if (e_place.is_grid())
       {
         // Create a composite data place defined by the grid of places + the partitioning function
         t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
@@ -662,14 +662,15 @@ public:
       if (e_place.size() == 1)
       {
         // Apply the parallel_for construct over the entire shape on the
-        // execution place of the task
+        // execution place of the task. For 1-element grids, extract the element.
+        const exec_place& scalar_place = e_place.is_grid() ? e_place.get_place(0) : e_place;
         if constexpr (need_reduction)
         {
-          do_parallel_for_redux(f, e_place, shape, t);
+          do_parallel_for_redux(f, scalar_place, shape, t);
         }
         else
         {
-          do_parallel_for(f, e_place, shape, t);
+          do_parallel_for(f, scalar_place, shape, t);
         }
       }
       else
diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
index 002b4134830..35a0e7e5c32 100644
--- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
@@ -210,10 +210,10 @@ public:
     return sub_places[i];
   }
 
-  /** @brief Build an exec_place_grid from the subplaces.
-   * @return A grid view of the partitioned execution places.
+  /** @brief Build an exec_place from the subplaces.
+   * @return A grid view of the partitioned execution places, or single place if size == 1.
    */
-  exec_place_grid as_grid() const
+  exec_place as_grid() const
   {
     return make_grid(sub_places);
   }
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 767c810e6b9..63dd0c958d8 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -807,12 +807,13 @@ public:
   }
 
   /**
-   * @brief Check if this is a multi-element grid (size > 1)
-   * @deprecated Use size() > 1 instead. All places are now grids.
+   * @brief Check if this is a grid execution place
+   *
+   * Returns true for any grid, including 1-element grids.
    */
   bool is_grid() const
   {
-    return size() > 1;
+    return affine_data_place().is_invalid();
   }
 
   /**
@@ -872,14 +873,14 @@ public:
     return exec_place::device(cuda_try<cudaGetDevice>());
   }
 
-  static exec_place_grid all_devices();
+  static exec_place all_devices();
 
-  static exec_place_grid n_devices(size_t n, dim4 dims);
+  static exec_place n_devices(size_t n, dim4 dims);
 
-  static exec_place_grid n_devices(size_t n);
+  static exec_place n_devices(size_t n);
 
   // For debug purpose on a machine with a single GPU, for example
-  static exec_place_grid repeat(const exec_place& e, size_t cnt);
+  static exec_place repeat(const exec_place& e, size_t cnt);
 
   template <typename... Args>
   auto partition_by_scope(Args&&... args);
@@ -1384,13 +1385,20 @@ public:
 };
 
 //! Creates a grid of execution places with specified dimensions
-inline exec_place_grid make_grid(::std::vector<exec_place> places, const dim4& dims)
+//! Returns the single element if size == 1 (no grid wrapper needed)
+inline exec_place make_grid(::std::vector<exec_place> places, const dim4& dims)
 {
+  _CCCL_ASSERT(!places.empty(), "invalid places");
+  if (places.size() == 1)
+  {
+    return mv(places[0]);
+  }
   return exec_place_grid(mv(places), dims);
 }
 
 //! Creates a linear grid from a vector of execution places
-inline exec_place_grid make_grid(::std::vector<exec_place> places)
+//! Returns the single element if size == 1 (no grid wrapper needed)
+inline exec_place make_grid(::std::vector<exec_place> places)
 {
   _CCCL_ASSERT(!places.empty(), "invalid places");
   auto grid_dim = dim4(places.size(), 1, 1, 1);
@@ -1462,20 +1470,26 @@ inline exec_place exec_place_device::impl::get_place(size_t idx) const
 }
 
 //! Creates a grid by replicating an execution place multiple times
-inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt)
+//! Returns the original place if cnt == 1 (no grid wrapper needed)
+inline exec_place exec_place::repeat(const exec_place& e, size_t cnt)
 {
+  if (cnt == 1)
+  {
+    return e;
+  }
   return make_grid(::std::vector<exec_place>(cnt, e));
 }
 
 /* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */
 inline exec_place_grid exec_place::as_grid() const
 {
-  EXPECT(size() > 1, "as_grid() called on scalar exec_place");
+  EXPECT(is_grid(), "as_grid() called on scalar exec_place");
   return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
 }
 
 /* Get the first N available devices */
-inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims)
+//! Returns single device if n == 1 (no grid wrapper needed)
+inline exec_place exec_place::n_devices(size_t n, dim4 dims)
 {
   const int ndevs = cuda_try<cudaGetDeviceCount>();
 
@@ -1492,18 +1506,21 @@ inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims)
 }
 
 /* Get the first N available devices */
-inline exec_place_grid exec_place::n_devices(size_t n)
+//! Returns single device if n == 1 (no grid wrapper needed)
+inline exec_place exec_place::n_devices(size_t n)
 {
   return n_devices(n, dim4(n, 1, 1, 1));
 }
 
-inline exec_place_grid exec_place::all_devices()
+//! Returns all available devices, or single device if only one GPU
+inline exec_place exec_place::all_devices()
 {
   return n_devices(cuda_try<cudaGetDeviceCount>());
 }
 
 //! Creates a cyclic partition of an execution place grid with specified strides
-inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id)
+//! Returns single place if partition contains only one element
+inline exec_place partition_cyclic(const exec_place& e_place, dim4 strides, pos4 tile_id)
 {
   dim4 g_dims = e_place.get_dims();
 
@@ -1547,10 +1564,11 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
 }
 
 //! Creates a tiled partition of an execution place grid with specified tile sizes
+//! Returns single place if partition contains only one element
 //!
 //! example :
 //! auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1))
-inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id)
+inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos4 tile_id)
 {
   dim4 g_dims = e_place.get_dims();
 
diff --git a/cudax/test/stf/places/recursion.cu b/cudax/test/stf/places/recursion.cu
index 3af51e4dc98..de86d7c22bf 100644
--- a/cudax/test/stf/places/recursion.cu
+++ b/cudax/test/stf/places/recursion.cu
@@ -12,7 +12,7 @@
 
 using namespace cuda::experimental::stf;
 
-void rec_func(exec_place_grid places)
+void rec_func(exec_place places)
 {
   if (places.size() == 1)
   {

From 36ea11ee06f63ebc6fbff9f777c3e569287d88a2 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Mon, 16 Mar 2026 15:46:29 -0400
Subject: [PATCH 09/12] Remove is_grid() method - use size() > 1 instead

With 1-element grids now collapsed to scalars by factory functions,
is_grid() is equivalent to size() > 1. Remove the method and use
the simpler size check directly.

Made-with: Cursor
---
 .../cuda/experimental/__stf/internal/launch.cuh      |  2 +-
 .../__stf/internal/parallel_for_scope.cuh            |  8 ++++----
 .../cuda/experimental/__stf/places/places.cuh        | 12 +-----------
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index 3888efef429..235219943db 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -335,7 +335,7 @@ public:
      *
      * An explicit composite data place is required per data dependency to customize this behaviour.
      */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Create a composite data place defined by the grid of places + the partitioning function
       t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index 50d4f523946..53da4f55f9f 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -550,8 +550,8 @@ public:
     // If there is a partitioner, we ensure there is a proper affine data place for this execution place
     if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
     {
-      // Grids (including 1-element grids) need a composite data place
-      if (e_place.is_grid())
+      // Grids need a composite data place
+      if (e_place.size() > 1)
       {
         // Create a composite data place defined by the grid of places + the partitioning function
         t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
@@ -662,8 +662,8 @@ public:
       if (e_place.size() == 1)
       {
         // Apply the parallel_for construct over the entire shape on the
-        // execution place of the task. For 1-element grids, extract the element.
-        const exec_place& scalar_place = e_place.is_grid() ? e_place.get_place(0) : e_place;
+        // execution place of the task.
+        const exec_place& scalar_place = e_place;
         if constexpr (need_reduction)
         {
           do_parallel_for_redux(f, scalar_place, shape, t);
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 63dd0c958d8..e49d690cf06 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -806,16 +806,6 @@ public:
     return affine_data_place().is_device();
   }
 
-  /**
-   * @brief Check if this is a grid execution place
-   *
-   * Returns true for any grid, including 1-element grids.
-   */
-  bool is_grid() const
-  {
-    return affine_data_place().is_invalid();
-  }
-
   /**
    * @brief Get the dimension along a specific axis
    * @deprecated Use get_dims().get(axis_id) instead
@@ -1483,7 +1473,7 @@ inline exec_place exec_place::repeat(const exec_place& e, size_t cnt)
 /* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */
 inline exec_place_grid exec_place::as_grid() const
 {
-  EXPECT(is_grid(), "as_grid() called on scalar exec_place");
+  EXPECT(size() > 1, "as_grid() called on scalar exec_place");
   return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
 }
 

From 383957adcd2f62558c789e0b0784f02082660e66 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Mon, 16 Mar 2026 18:34:22 -0400
Subject: [PATCH 10/12] Change data_place::composite and get_grid to use
 exec_place

Update data_place::composite() to accept const exec_place& instead of
const exec_place_grid&. This allows passing exec_place from factory
functions that now return exec_place (like repeat(), all_devices()).

Also update:
- data_place_composite to store exec_place instead of exec_place_grid
- get_grid() to return const exec_place&
- localized_array constructor parameter and grid member
- slice interface local variables

Made-with: Cursor
---
 .../__stf/graph/interfaces/slice.cuh             |  2 +-
 .../__stf/localization/composite_slice.cuh       |  4 ++--
 .../__stf/places/data_place_interface.cuh        |  2 +-
 .../cuda/experimental/__stf/places/places.cuh    | 16 ++++++++--------
 .../__stf/stream/interfaces/slice.cuh            |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
index 6aa1b142e9f..4a053ea53b3 100644
--- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
@@ -87,7 +87,7 @@ public:
       return;
     }
 
-    exec_place_grid grid = memory_node.get_grid();
+    exec_place grid = memory_node.get_grid();
     size_t total_size    = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
index c4298f35459..b9d7771573a 100644
--- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
@@ -75,7 +75,7 @@ public:
   // ::std::function<pos4(size_t)> delinearize : translate the index in a buffer into a position in the data
   // TODO pass mv(place)
   template <typename F>
-  localized_array(exec_place_grid grid,
+  localized_array(exec_place grid,
                   get_executor_func_t mapper,
                   F&& delinearize,
                   size_t total_size,
@@ -422,7 +422,7 @@ private:
   }
 
   event_list prereqs; // To allow reuse in a cache
-  exec_place_grid grid;
+  exec_place grid;
   get_executor_func_t mapper = nullptr;
   ::std::vector<metadata> meta;
 
diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
index ecf6d8f7b45..b5ff565c251 100644
--- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
@@ -184,7 +184,7 @@ public:
    * @brief Get the grid for composite places
    * @throws std::logic_error if not a composite place
    */
-  virtual const exec_place_grid& get_grid() const
+  virtual const exec_place& get_grid() const
   {
     throw ::std::logic_error("get_grid() called on non-composite data_place");
   }
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index e49d690cf06..58bc80ec9f8 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -165,9 +165,9 @@ public:
 
   // User-visible API when using a different partitioner than the one of the grid
   template <typename partitioner_t /*, typename scalar_exec_place_t */>
-  static data_place composite(partitioner_t p, const exec_place_grid& g);
+  static data_place composite(partitioner_t p, const exec_place& g);
 
-  static data_place composite(get_executor_func_t f, const exec_place_grid& grid);
+  static data_place composite(get_executor_func_t f, const exec_place& grid);
 
 #if _CCCL_CTK_AT_LEAST(12, 4)
   static data_place green_ctx(const green_ctx_view& gc_view);
@@ -296,7 +296,7 @@ public:
     return p.pimpl_->get_device_ordinal();
   }
 
-  const exec_place_grid& get_grid() const
+  const exec_place& get_grid() const
   {
     return pimpl_->get_grid();
   }
@@ -1619,7 +1619,7 @@ inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos
 class data_place_composite final : public data_place_interface
 {
 public:
-  data_place_composite(exec_place_grid grid, get_executor_func_t partitioner_func)
+  data_place_composite(exec_place grid, get_executor_func_t partitioner_func)
       : grid_(mv(grid))
       , partitioner_func_(mv(partitioner_func))
   {}
@@ -1679,7 +1679,7 @@ public:
     return false;
   }
 
-  const exec_place_grid& get_grid() const override
+  const exec_place& get_grid() const override
   {
     return grid_;
   }
@@ -1690,7 +1690,7 @@ public:
   }
 
 private:
-  exec_place_grid grid_;
+  exec_place grid_;
   get_executor_func_t partitioner_func_;
 };
 
@@ -1700,14 +1700,14 @@ inline bool data_place::is_composite() const
   return typeid(ref) == typeid(data_place_composite);
 }
 
-inline data_place data_place::composite(get_executor_func_t f, const exec_place_grid& grid)
+inline data_place data_place::composite(get_executor_func_t f, const exec_place& grid)
 {
   return data_place(::std::make_shared<data_place_composite>(grid, f));
 }
 
 // User-visible API when the same partitioner as the one of the grid
 template <typename partitioner_t>
-data_place data_place::composite(partitioner_t, const exec_place_grid& g)
+data_place data_place::composite(partitioner_t, const exec_place& g)
 {
   return data_place::composite(&partitioner_t::get_executor, g);
 }
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index 21ac9da53a6..5a01d242608 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -94,7 +94,7 @@ public:
       return;
     }
 
-    exec_place_grid grid = memory_node.get_grid();
+    exec_place grid = memory_node.get_grid();
     size_t total_size    = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)

From 65346793c5886d139240798bfd8734c0b06e58be Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Mon, 16 Mar 2026 18:52:32 -0400
Subject: [PATCH 11/12] Remove exec_place_grid class, use exec_place everywhere

exec_place_grid was a shell class that provided no additional value over
exec_place now that all places support the unified grid model. This change:

- Removes exec_place_grid class, keeps impl as exec_place_grid_impl
- Updates make_grid() to return exec_place directly
- Changes as_grid() to return const exec_place& (checks size() > 1, then returns *this)
- Updates place_partition constructor to take const exec_place&
- Updates loop_dispatch template parameter to exec_place
- Removes the `= delete`d parallel_for(exec_place_grid, ...) overload declaration
- Updates C API (stf.cu) to use exec_place* instead of exec_place_grid*
- Removes forward declarations of exec_place_grid

The C API types (stf_exec_place_grid_handle) are unchanged as they are
opaque void* handles that don't depend on the C++ class name.

Made-with: Cursor
---
 c/experimental/stf/src/stf.cu                 |  12 +-
 .../__stf/internal/backend_ctx.cuh            |   3 -
 .../__stf/internal/loop_dispatch.cuh          |   2 +-
 .../__stf/places/data_place_interface.cuh     |   1 -
 .../__stf/places/place_partition.cuh          |  11 +-
 .../cuda/experimental/__stf/places/places.cuh | 263 +++++++-----------
 6 files changed, 115 insertions(+), 177 deletions(-)

diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index f57ad22db37..27a07cfab0a 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p)
       {
         return data_place::invalid();
       }
-      exec_place_grid* grid_ptr = static_cast<exec_place_grid*>(grid_handle);
+      exec_place* grid_ptr = static_cast<exec_place*>(grid_handle);
       // Layout-compatible: pass C mapper directly so the runtime calls it
       get_executor_func_t cpp_mapper = reinterpret_cast<get_executor_func_t>(mapper);
       return data_place::composite(cpp_mapper, *grid_ptr);
@@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id
   {
     places.push_back(exec_place::device(device_ids[i]));
   }
-  exec_place_grid grid = make_grid(::std::move(places));
-  return new exec_place_grid(::std::move(grid));
+  exec_place grid = make_grid(::std::move(places));
+  return new exec_place(::std::move(grid));
 }
 
 stf_exec_place_grid_handle
@@ -439,18 +439,18 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf
   {
     cpp_places.push_back(to_exec_place(const_cast<stf_exec_place*>(&places[i])));
   }
-  exec_place_grid grid =
+  exec_place grid =
     (grid_dims != nullptr)
       ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
       : make_grid(::std::move(cpp_places));
-  return new exec_place_grid(::std::move(grid));
+  return new exec_place(::std::move(grid));
 }
 
 void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid)
 {
   if (grid != nullptr)
   {
-    delete static_cast<exec_place_grid*>(grid);
+    delete static_cast<exec_place*>(grid);
   }
 }
 
diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
index b3659b7c829..3ef18a36803 100644
--- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -1117,9 +1117,6 @@ public:
     }
   }
 
-  template <typename S, typename... Deps>
-  auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete;
-
   template <typename S, typename... Deps>
   auto parallel_for(S shape, Deps... deps)
   {
diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
index 3359a8ea03c..f9acdc65fcc 100644
--- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
@@ -161,7 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi
   }
   else
   {
-    loop_dispatch<context_t, exec_place_grid, use_threads>(
+    loop_dispatch<context_t, exec_place, use_threads>(
       mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
   }
 }
diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
index b5ff565c251..14b20f4a707 100644
--- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
@@ -46,7 +46,6 @@ namespace cuda::experimental::stf
 {
 // Forward declarations
 class exec_place;
-class exec_place_grid;
 class pos4;
 class dim4;
 
diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
index 35a0e7e5c32..df67b92396b 100644
--- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
@@ -80,7 +80,7 @@ inline ::std::string place_partition_scope_to_string(place_partition_scope scope
  * `cuda_device` scope. Green context scope requires CUDA 12.4 or later.
  *
  * Iteration over subplaces is provided via `begin()` / `end()`; `as_grid()` builds
- * an `exec_place_grid` from the subplaces.
+ * an `exec_place` grid from the subplaces.
  */
 class place_partition
 {
@@ -131,14 +131,13 @@ public:
    * @param grid Input execution place grid to partition
    * @param scope Partitioning granularity
    */
-  place_partition(async_resources_handle& handle, const exec_place_grid& grid, place_partition_scope scope)
+  place_partition(async_resources_handle& handle, const exec_place& grid, place_partition_scope scope)
   {
     ::std::vector<::std::shared_ptr<exec_place>> places;
-    const auto& grid_places = grid.get_places();
-    places.reserve(grid_places.size());
-    for (const auto& ep : grid_places)
+    places.reserve(grid.size());
+    for (size_t i = 0; i < grid.size(); ++i)
     {
-      places.push_back(::std::make_shared<exec_place>(ep));
+      places.push_back(::std::make_shared<exec_place>(grid.get_place(i)));
     }
     for (const auto& place : places)
     {
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 58bc80ec9f8..f5a45ba892f 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -50,7 +50,6 @@ namespace cuda::experimental::stf
 {
 class exec_place;
 class exec_place_host;
-class exec_place_grid;
 
 // Green contexts are only supported since CUDA 12.4
 
@@ -825,10 +824,14 @@ public:
   }
 
   /**
-   * @brief Convert to exec_place_grid type
+   * @brief Returns *this for compatibility
    * @deprecated All places are grids now; use exec_place methods directly
    */
-  exec_place_grid as_grid() const;
+  const exec_place& as_grid() const
+  {
+    EXPECT(size() > 1, "as_grid() called on scalar exec_place");
+    return *this;
+  }
 
   /* These helper methods provide convenient way to express execution places,
    * for example exec_place::host or exec_place::device(4).
@@ -1181,197 +1184,146 @@ UNITTEST("exec_place copyable")
 };
 #endif // UNITTESTED_FILE
 
-//! A multidimensional grid of execution places for structured parallel computation
-class exec_place_grid : public exec_place
+/**
+ * Implementation class for multi-device execution place grids.
+ * This is used internally by make_grid() and related factory functions.
+ */
+class exec_place_grid_impl : public exec_place::impl
 {
 public:
-  /*
-   * Implementation of the exec_place_grid
-   */
-  class impl : public exec_place::impl
+  exec_place_grid_impl(::std::vector<exec_place> _places)
+      : dims_(_places.size(), 1, 1, 1)
+      , places_(mv(_places))
   {
-  public:
-    impl(::std::vector<exec_place> _places)
-        : dims_(_places.size(), 1, 1, 1)
-        , places_(mv(_places))
-    {
-      _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
-      _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
-    }
+    _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+    _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
+  }
 
-    impl(::std::vector<exec_place> _places, const dim4& _dims)
-        : dims_(_dims)
-        , places_(mv(_places))
-    {
-      _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
-    }
+  exec_place_grid_impl(::std::vector<exec_place> _places, const dim4& _dims)
+      : dims_(_dims)
+      , places_(mv(_places))
+  {
+    _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
+  }
 
-    // ===== Grid interface =====
+  // ===== Grid interface =====
 
-    dim4 get_dims() const override
-    {
-      return dims_;
-    }
+  dim4 get_dims() const override
+  {
+    return dims_;
+  }
 
-    size_t size() const override
-    {
-      return dims_.size();
-    }
+  size_t size() const override
+  {
+    return dims_.size();
+  }
 
-    exec_place get_place(size_t idx) const override
-    {
-      EXPECT(idx < places_.size(), "Index out of bounds");
-      return places_[idx];
-    }
+  exec_place get_place(size_t idx) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    return places_[idx];
+  }
 
-    // ===== Activation (delegates to sub-places) =====
+  // ===== Activation (delegates to sub-places) =====
 
-    exec_place activate(size_t idx) const override
-    {
-      EXPECT(idx < places_.size(), "Index out of bounds");
-      return places_[idx].activate(0);
-    }
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    return places_[idx].activate(0);
+  }
 
-    void deactivate(size_t idx, const exec_place& prev) const override
-    {
-      EXPECT(idx < places_.size(), "Index out of bounds");
-      places_[idx].deactivate(0, prev);
-    }
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    places_[idx].deactivate(0, prev);
+  }
 
-    // ===== Properties =====
+  // ===== Properties =====
 
-    ::std::string to_string() const override
-    {
-      return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z)
-           + "x" + ::std::to_string(dims_.t) + ")";
-    }
+  ::std::string to_string() const override
+  {
+    return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z)
+         + "x" + ::std::to_string(dims_.t) + ")";
+  }
 
-    // ===== Comparison =====
+  // ===== Comparison =====
 
-    int cmp(const exec_place::impl& rhs) const override
+  int cmp(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs)) ? -1 : 1;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      // Compare dims first
-      auto this_dims  = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t);
-      auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
-      if (this_dims < other_dims)
-      {
-        return -1;
-      }
-      if (other_dims < this_dims)
-      {
-        return 1;
-      }
-      // Then compare places
-      if (places_ < other.places_)
-      {
-        return -1;
-      }
-      if (other.places_ < places_)
-      {
-        return 1;
-      }
-      return 0;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
-
-    size_t hash() const override
+    const auto& other = static_cast<const exec_place_grid_impl&>(rhs);
+    // Compare dims first
+    auto this_dims  = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t);
+    auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
+    if (this_dims < other_dims)
     {
-      size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims_);
-      for (const auto& p : places_)
-      {
-        hash_combine(h, p.hash());
-      }
-      return h;
+      return -1;
     }
-
-    // ===== Stream management =====
-
-    stream_pool& get_stream_pool(bool for_computation) const override
+    if (other_dims < this_dims)
     {
-      _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool");
-      _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
-      return places_[0].get_stream_pool(for_computation);
+      return 1;
     }
-
-    // ===== Grid-specific accessors =====
-
-    const ::std::vector<exec_place>& get_places() const
+    // Then compare places
+    if (places_ < other.places_)
     {
-      return places_;
+      return -1;
     }
-
-    // ===== Grid iteration state =====
-
-    ::std::ptrdiff_t get_current_idx() const override
+    if (other.places_ < places_)
     {
-      return current_idx_;
+      return 1;
     }
+    return 0;
+  }
 
-    void set_current_idx(::std::ptrdiff_t idx) const override
+  size_t hash() const override
+  {
+    size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims_);
+    for (const auto& p : places_)
     {
-      current_idx_ = idx;
+      hash_combine(h, p.hash());
     }
+    return h;
+  }
 
-    ::std::shared_ptr<exec_place::impl> get_saved_prev_impl() const override
-    {
-      return saved_prev_impl_;
-    }
+  // ===== Stream management =====
 
-    void set_saved_prev_impl(::std::shared_ptr<exec_place::impl> p) const override
-    {
-      saved_prev_impl_ = mv(p);
-    }
+  stream_pool& get_stream_pool(bool for_computation) const override
+  {
+    _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool");
+    _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+    return places_[0].get_stream_pool(for_computation);
+  }
 
-  private:
-    dim4 dims_;
-    ::std::vector<exec_place> places_;
-    mutable ::std::ptrdiff_t current_idx_ = -1;
-    mutable ::std::shared_ptr<exec_place::impl> saved_prev_impl_;
-  };
+  // ===== Grid iteration state =====
 
-  explicit operator bool() const
+  ::std::ptrdiff_t get_current_idx() const override
   {
-    return exec_place::get_impl() != nullptr;
+    return current_idx_;
   }
 
-  bool operator==(const exec_place_grid& rhs) const
+  void set_current_idx(::std::ptrdiff_t idx) const override
   {
-    return get_impl()->cmp(*rhs.get_impl()) == 0;
+    current_idx_ = idx;
   }
 
-  /**
-   * @brief Get the vector of sub-places (grid-specific)
-   */
-  const ::std::vector<exec_place>& get_places() const
+  ::std::shared_ptr<exec_place::impl> get_saved_prev_impl() const override
   {
-    return get_impl()->get_places();
+    return saved_prev_impl_;
   }
 
-  /**
-   * @brief Get the typed impl (for grid-specific operations)
-   */
-  ::std::shared_ptr<impl> get_impl() const
+  void set_saved_prev_impl(::std::shared_ptr<exec_place::impl> p) const override
   {
-    _CCCL_ASSERT(::std::dynamic_pointer_cast<impl>(exec_place::get_impl()), "Invalid exec_place_grid impl");
-    return ::std::static_pointer_cast<impl>(exec_place::get_impl());
+    saved_prev_impl_ = mv(p);
   }
 
-  // Default constructor
-  exec_place_grid()
-      : exec_place(nullptr)
-  {}
-
-  exec_place_grid(::std::shared_ptr<impl> p)
-      : exec_place(mv(p))
-  {}
-
-  exec_place_grid(::std::vector<exec_place> p, const dim4& d)
-      : exec_place(::std::make_shared<impl>(mv(p), d))
-  {}
+private:
+  dim4 dims_;
+  ::std::vector<exec_place> places_;
+  mutable ::std::ptrdiff_t current_idx_ = -1;
+  mutable ::std::shared_ptr<exec_place::impl> saved_prev_impl_;
 };
 
 //! Creates a grid of execution places with specified dimensions
@@ -1383,7 +1335,7 @@ inline exec_place make_grid(::std::vector<exec_place> places, const dim4& dims)
   {
     return mv(places[0]);
   }
-  return exec_place_grid(mv(places), dims);
+  return exec_place(::std::make_shared<exec_place_grid_impl>(mv(places), dims));
 }
 
 //! Creates a linear grid from a vector of execution places
@@ -1396,7 +1348,6 @@ inline exec_place make_grid(::std::vector<exec_place> places)
 }
 
 // === data_place::affine_exec_place implementation ===
-// Defined here after exec_place_grid is complete
 
 inline exec_place data_place::affine_exec_place() const
 {
@@ -1414,7 +1365,6 @@ inline exec_place data_place::affine_exec_place() const
   if (is_composite())
   {
     // Return the grid of places associated to this composite data place
-    // exec_place_grid inherits from exec_place, so this works via slicing
     return get_grid();
   }
 
@@ -1470,13 +1420,6 @@ inline exec_place exec_place::repeat(const exec_place& e, size_t cnt)
   return make_grid(::std::vector<exec_place>(cnt, e));
 }
 
-/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */
-inline exec_place_grid exec_place::as_grid() const
-{
-  EXPECT(size() > 1, "as_grid() called on scalar exec_place");
-  return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
-}
-
 /* Get the first N available devices */
 //! Returns single device if n == 1 (no grid wrapper needed)
 inline exec_place exec_place::n_devices(size_t n, dim4 dims)

From 77589aee862d819fe6ec4e80469f04328d9ed785 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Mon, 16 Mar 2026 21:29:03 -0400
Subject: [PATCH 12/12] pre-commit

---
 c/experimental/stf/src/stf.cu                             | 7 +++----
 .../cuda/experimental/__stf/graph/interfaces/slice.cuh    | 4 ++--
 .../cuda/experimental/__stf/internal/loop_dispatch.cuh    | 3 +--
 .../experimental/__stf/localization/composite_slice.cuh   | 8 ++------
 cudax/include/cuda/experimental/__stf/places/places.cuh   | 4 ++--
 .../cuda/experimental/__stf/stream/interfaces/slice.cuh   | 4 ++--
 6 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index 27a07cfab0a..7441e508916 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -439,10 +439,9 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf
   {
     cpp_places.push_back(to_exec_place(const_cast<stf_exec_place*>(&places[i])));
   }
-  exec_place grid =
-    (grid_dims != nullptr)
-      ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
-      : make_grid(::std::move(cpp_places));
+  exec_place grid = (grid_dims != nullptr)
+                    ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
+                    : make_grid(::std::move(cpp_places));
   return new exec_place(::std::move(grid));
 }
 
diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
index 4a053ea53b3..f7211b724a0 100644
--- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
@@ -87,8 +87,8 @@ public:
       return;
     }
 
-    exec_place grid = memory_node.get_grid();
-    size_t total_size    = this->shape.size();
+    exec_place grid   = memory_node.get_grid();
+    size_t total_size = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
     // * index = x + nx*y + nx*ny*z + nx*ny*nz*t
diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
index f9acdc65fcc..9df09e99430 100644
--- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
@@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi
   }
   else
   {
-    loop_dispatch<context_t, exec_place, use_threads>(
-      mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
+    loop_dispatch<context_t, exec_place, use_threads>(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
   }
 }
 #endif // _CCCL_DOXYGEN_INVOKED
diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
index b9d7771573a..23ed4384f5f 100644
--- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
@@ -75,12 +75,8 @@ public:
   // ::std::function<pos4(size_t)> delinearize : translate the index in a buffer into a position in the data
   // TODO pass mv(place)
   template <typename F>
-  localized_array(exec_place grid,
-                  get_executor_func_t mapper,
-                  F&& delinearize,
-                  size_t total_size,
-                  size_t elemsize,
-                  dim4 data_dims)
+  localized_array(
+    exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 data_dims)
       : grid(mv(grid))
       , mapper(mv(mapper))
       , total_size_bytes(total_size * elemsize)
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index f5a45ba892f..903d15481cf 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -1242,8 +1242,8 @@ public:
 
   ::std::string to_string() const override
   {
-    return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z)
-         + "x" + ::std::to_string(dims_.t) + ")";
+    return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + "x"
+         + ::std::to_string(dims_.t) + ")";
   }
 
   // ===== Comparison =====
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index 5a01d242608..3d71c5c6993 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -94,8 +94,8 @@ public:
       return;
     }
 
-    exec_place grid = memory_node.get_grid();
-    size_t total_size    = this->shape.size();
+    exec_place grid   = memory_node.get_grid();
+    size_t total_size = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
     // * index = x + nx*y + nx*ny*z + nx*ny*nz*t