diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index f57ad22db37..7441e508916 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p)
       {
         return data_place::invalid();
       }
-      exec_place_grid* grid_ptr = static_cast<exec_place_grid*>(grid_handle);
+      exec_place* grid_ptr = static_cast<exec_place*>(grid_handle);
       // Layout-compatible: pass C mapper directly so the runtime calls it
       get_executor_func_t cpp_mapper = reinterpret_cast<get_executor_func_t>(mapper);
       return data_place::composite(cpp_mapper, *grid_ptr);
@@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id
   {
     places.push_back(exec_place::device(device_ids[i]));
   }
-  exec_place_grid grid = make_grid(::std::move(places));
-  return new exec_place_grid(::std::move(grid));
+  exec_place grid = make_grid(::std::move(places));
+  return new exec_place(::std::move(grid));
 }
 
 stf_exec_place_grid_handle
@@ -439,18 +439,17 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf
   {
     cpp_places.push_back(to_exec_place(const_cast<stf_exec_place*>(&places[i])));
   }
-  exec_place_grid grid =
-    (grid_dims != nullptr)
-      ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
-      : make_grid(::std::move(cpp_places));
-  return new exec_place_grid(::std::move(grid));
+  exec_place grid = (grid_dims != nullptr)
+                    ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
+                    : make_grid(::std::move(cpp_places));
+  return new exec_place(::std::move(grid));
 }
 
 void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid)
 {
   if (grid != nullptr)
   {
-    delete static_cast<exec_place_grid*>(grid);
+    delete static_cast<exec_place*>(grid);
   }
 }
 
diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
index 39375f6db2d..3251b32eded 100644
--- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
@@ -456,17 +456,17 @@ public:
 
   void set_current_place(pos4 p)
   {
-    get_exec_place().as_grid().set_current_place(p);
+    get_exec_place().set_current_place(p);
   }
 
   void unset_current_place()
   {
-    get_exec_place().as_grid().unset_current_place();
+    get_exec_place().unset_current_place();
   }
 
-  const exec_place& get_current_place() const
+  exec_place get_current_place() const
   {
-    return get_exec_place().as_grid().get_current_place();
+    return get_exec_place().get_current_place();
   }
 
 private:
diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
index 6aa1b142e9f..f7211b724a0 100644
--- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh
@@ -87,8 +87,8 @@ public:
       return;
     }
 
-    exec_place_grid grid = memory_node.get_grid();
-    size_t total_size    = this->shape.size();
+    exec_place grid   = memory_node.get_grid();
+    size_t total_size = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
     // * index = x + nx*y + nx*ny*z + nx*ny*nz*t
diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
index b3659b7c829..3ef18a36803 100644
--- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -1117,9 +1117,6 @@ public:
     }
   }
 
-  template <typename S, typename... Deps>
-  auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete;
-
   template <typename S, typename... Deps>
   auto parallel_for(S shape, Deps... deps)
   {
diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index b33b14929c7..235219943db 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
 template <typename Fun, typename interpreted_spec, typename Arg>
 void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   p->*[&] {
     auto th = thread_hierarchy(static_cast<int>(rank), interpreted_policy);
@@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 template <typename task_t, typename Fun, typename interpreted_spec, typename Arg>
 void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank)
 {
-  assert(!p.is_grid());
+  assert(p.size() == 1);
 
   auto kernel_args = tuple_prepend(thread_hierarchy(static_cast<int>(rank), interpreted_policy), mv(arg));
   using args_type  = decltype(kernel_args);
@@ -331,11 +331,11 @@ public:
     assert(e_place.affine_data_place() == t.get_affine_data_place());
 
     /*
-     * If we have a grid of places, the implicit affine partitioner is the blocked_partition.
+     * If we have a grid of more than one place, the implicit affine partitioner is the blocked_partition.
      *
      * An explicit composite data place is required per data dependency to customize this behaviour.
      */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Create a composite data place defined by the grid of places + the partitioning function
       t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
index 3359a8ea03c..9df09e99430 100644
--- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh
@@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi
   }
   else
   {
-    loop_dispatch<context_t, exec_place_grid, use_threads>(
-      mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
+    loop_dispatch<context_t, exec_place, use_threads>(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
   }
 }
 #endif // _CCCL_DOXYGEN_INVOKED
diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index ccc420f6609..53da4f55f9f 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -550,8 +550,8 @@ public:
     // If there is a partitioner, we ensure there is a proper affine data place for this execution place
     if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
     {
-      // This is only meaningful for grid of places
-      if (e_place.is_grid())
+      // Only grids of more than one place need a composite data place
+      if (e_place.size() > 1)
       {
         // Create a composite data place defined by the grid of places + the partitioning function
         t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
@@ -629,7 +629,7 @@ public:
     if constexpr (need_reduction)
     {
       _CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host.");
-      _CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places.");
+      _CCCL_ASSERT(e_place.size() == 1, "Reduce access mode currently unimplemented on grid of places.");
       do_parallel_for_redux(f, e_place, shape, t);
       return;
     }
@@ -659,17 +659,18 @@ public:
     if constexpr (!::std::is_same_v<exec_place_t, exec_place_host> && is_extended_host_device_lambda_closure_type
                   || is_extended_device_lambda_closure_type)
     {
-      if (!e_place.is_grid())
+      if (e_place.size() == 1)
       {
         // Apply the parallel_for construct over the entire shape on the
-        // execution place of the task
+        // execution place of the task.
+        const exec_place& scalar_place = e_place;
         if constexpr (need_reduction)
         {
-          do_parallel_for_redux(f, e_place, shape, t);
+          do_parallel_for_redux(f, scalar_place, shape, t);
         }
         else
         {
-          do_parallel_for(f, e_place, shape, t);
+          do_parallel_for(f, scalar_place, shape, t);
         }
       }
       else
@@ -681,11 +682,12 @@ public:
         }
         else
         {
-          size_t grid_size = t.grid_dims().size();
+          const auto& t_place = t.get_exec_place();
+          size_t grid_size    = t_place.size();
           for (size_t i = 0; i < grid_size; i++)
           {
             t.set_current_place(pos4(i));
-            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims());
+            const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims());
             do_parallel_for(f, t.get_current_place(), sub_shape, t);
             t.unset_current_place();
           }
diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
index c4298f35459..23ed4384f5f 100644
--- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
@@ -75,12 +75,8 @@ public:
   // ::std::function<pos4(size_t)> delinearize : translate the index in a buffer into a position in the data
   // TODO pass mv(place)
   template <typename F>
-  localized_array(exec_place_grid grid,
-                  get_executor_func_t mapper,
-                  F&& delinearize,
-                  size_t total_size,
-                  size_t elemsize,
-                  dim4 data_dims)
+  localized_array(
+    exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 data_dims)
       : grid(mv(grid))
       , mapper(mv(mapper))
       , total_size_bytes(total_size * elemsize)
@@ -422,7 +418,7 @@ private:
   }
 
   event_list prereqs; // To allow reuse in a cache
-  exec_place_grid grid;
+  exec_place grid;
   get_executor_func_t mapper = nullptr;
   ::std::vector<metadata> meta;
 
diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
index ecf6d8f7b45..14b20f4a707 100644
--- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh
@@ -46,7 +46,6 @@ namespace cuda::experimental::stf
 {
 // Forward declarations
 class exec_place;
-class exec_place_grid;
 class pos4;
 class dim4;
 
@@ -184,7 +183,7 @@ public:
    * @brief Get the grid for composite places
    * @throws std::logic_error if not a composite place
    */
-  virtual const exec_place_grid& get_grid() const
+  virtual const exec_place& get_grid() const
   {
     throw ::std::logic_error("get_grid() called on non-composite data_place");
   }
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
index a18545e4014..a490f57054f 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
@@ -30,92 +30,82 @@
 namespace cuda::experimental::stf
 {
 /**
- * @brief Designates execution that is to run on a specific CUDA stream
- *
+ * @brief Implementation for CUDA stream execution places
  */
-class exec_place_cuda_stream : public exec_place
+class exec_place_cuda_stream_impl : public exec_place::impl
 {
 public:
-  class impl : public exec_place::impl
+  exec_place_cuda_stream_impl(const decorated_stream& dstream)
+      : exec_place::impl(data_place::device(dstream.dev_id))
+      , dstream_(dstream)
+      , dummy_pool_(dstream)
+  {}
+
+  exec_place get_place(size_t idx) const override
   {
-  public:
-    impl(const decorated_stream& _dstream)
-        : exec_place::impl(data_place::device(_dstream.dev_id))
-        , dstream(_dstream)
-        , dummy_pool(_dstream)
-    {}
-
-    /* We set the current device to be the device on which the CUDA stream was created */
-    exec_place activate() const override
-    {
-      return exec_place::device(dstream.dev_id).activate();
-    }
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::cuda_stream(dstream_);
+  }
 
-    void deactivate(const exec_place& prev) const override
-    {
-      return exec_place::device(dstream.dev_id).deactivate(prev);
-    }
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    return exec_place::device(dstream_.dev_id).activate();
+  }
 
-    stream_pool& get_stream_pool(bool) const override
-    {
-      return dummy_pool;
-    }
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
+    exec_place::device(dstream_.dev_id).deactivate(prev);
+  }
 
-    ::std::string to_string() const override
-    {
-      return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")";
-    }
+  stream_pool& get_stream_pool(bool) const override
+  {
+    return dummy_pool_;
+  }
 
-    bool operator==(const exec_place::impl& rhs) const override
+  ::std::string to_string() const override
+  {
+    return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
+  }
+
+  int cmp(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return false;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      // Compare by stream handle
-      return dstream.stream == other.dstream.stream;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
-
-    size_t hash() const override
+    const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
+    if (dstream_.stream < other.dstream_.stream)
     {
-      // Hash the stream handle, not the affine data place
-      return ::std::hash<cudaStream_t>()(dstream.stream);
+      return -1;
     }
-
-    bool operator<(const exec_place::impl& rhs) const override
+    if (other.dstream_.stream < dstream_.stream)
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return dstream.stream < other.dstream.stream;
+      return 1;
     }
+    return 0;
+  }
 
-  private:
-    decorated_stream dstream;
-    // We create a dummy pool of streams which only consists in a single stream in practice.
-    mutable stream_pool dummy_pool;
-  };
-
-public:
-  exec_place_cuda_stream(const decorated_stream& dstream)
-      : exec_place(::std::make_shared<impl>(dstream))
+  size_t hash() const override
   {
-    static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place),
-                  "exec_place_cuda_stream cannot add state; it would be sliced away.");
+    return ::std::hash<cudaStream_t>()(dstream_.stream);
   }
+
+private:
+  decorated_stream dstream_;
+  mutable stream_pool dummy_pool_;
 };
 
-inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)
+inline exec_place exec_place::cuda_stream(cudaStream_t stream)
 {
   int devid = get_device_from_stream(stream);
-  return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));
+  return exec_place(
+    ::std::make_shared<exec_place_cuda_stream_impl>(decorated_stream(stream, get_stream_id(stream), devid)));
 }
 
-inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)
+inline exec_place exec_place::cuda_stream(const decorated_stream& dstream)
 {
-  return exec_place_cuda_stream(dstream);
+  return exec_place(::std::make_shared<exec_place_cuda_stream_impl>(dstream));
 }
 } // end namespace cuda::experimental::stf
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
index f5e5531eb52..284cb3f134e 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
@@ -252,135 +252,112 @@ private:
 };
 
 /**
- * @brief Designates execution that is to run on a green context. Initialize with the device ordinal and green_context
+ * @brief Implementation for green context execution places
  */
-class exec_place_green_ctx : public exec_place
+class exec_place_green_ctx_impl : public exec_place::impl
 {
 public:
-  class impl : public exec_place::impl
+  /**
+   * @brief Construct a green context execution place
+   *
+   * @param gc_view The green context view
+   * @param use_green_ctx_data_place If true, use a green context data place as the
+   *        affine data place. If false (default), use a regular device data place instead.
+   */
+  exec_place_green_ctx_impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
+      : exec_place::impl(
+          use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid))
+      , devid_(gc_view.devid)
+      , g_ctx_(gc_view.g_ctx)
+      , pool_(mv(gc_view.pool))
+  {}
+
+  // This is used to implement deactivate and wrap an existing context
+  exec_place_green_ctx_impl(CUcontext saved_context)
+      : driver_context_(saved_context)
+  {}
+
+  exec_place get_place(size_t idx) const override
   {
-  public:
-    /**
-     * @brief Construct a green context execution place
-     *
-     * @param gc_view The green context view
-     * @param use_green_ctx_data_place If true, use a green context data place as the
-     *        affine data place. If false (default), use a regular device data place instead.
-     */
-    impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
-        : exec_place::impl(
-            use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid))
-        , devid(gc_view.devid)
-        , g_ctx(gc_view.g_ctx)
-        , pool(mv(gc_view.pool))
-    {}
-
-    // This is used to implement deactivate and wrap an existing context
-    impl(CUcontext saved_context)
-        : driver_context(saved_context)
-    {}
-
-    exec_place activate() const override
-    {
-      // Save the current context and transform it into a fake green context place
-      CUcontext current_ctx;
-      cuda_safe_call(cuCtxGetCurrent(&current_ctx));
-      exec_place result = exec_place(::std::make_shared<impl>(current_ctx));
-
-      // Convert the green context to a primary context (TODO cache this ?)
-      cuda_safe_call(cuCtxFromGreenCtx(&driver_context, g_ctx));
-
-#  if 0
-            // for debug purposes, display the affinity
-            {
-                CUdevResource check_resource;
-                cuda_safe_call(cuGreenCtxGetDevResource(g_ctx, &check_resource, CU_DEV_RESOURCE_TYPE_SM));
-                unsigned long long check_ctxId;
-                cuda_safe_call(cuCtxGetId(driver_context, &check_ctxId));
-                fprintf(stderr, "ACTIVATE : set affinity with %d SMs (ctx ID = %llu)\n", check_resource.sm.smCount,
-                        check_ctxId);
-            }
-#  endif
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+    return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_));
+  }
 
-      cuda_safe_call(cuCtxSetCurrent(driver_context));
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
 
-      return result;
-    }
+    // Save the current context and transform it into a fake green context place
+    CUcontext current_ctx;
+    cuda_safe_call(cuCtxGetCurrent(&current_ctx));
+    exec_place result = exec_place(::std::make_shared<exec_place_green_ctx_impl>(current_ctx));
 
-    void deactivate(const exec_place& prev) const override
-    {
-      auto prev_impl      = ::std::static_pointer_cast<impl>(prev.get_impl());
-      CUcontext saved_ctx = prev_impl->driver_context;
+    // Convert the green context to a primary context
+    cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_));
+    cuda_safe_call(cuCtxSetCurrent(driver_context_));
+
+    return result;
+  }
+
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place");
+
+    auto prev_impl      = ::std::static_pointer_cast<exec_place_green_ctx_impl>(prev.get_impl());
+    CUcontext saved_ctx = prev_impl->driver_context_;
 
 #  ifdef DEBUG
-      // Ensure that the current context is the green context that we have activated before
-      CUcontext current_ctx;
-      cuda_safe_call(cuCtxGetCurrent(&current_ctx));
-      assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context));
+    CUcontext current_ctx;
+    cuda_safe_call(cuCtxGetCurrent(&current_ctx));
+    assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_));
 #  endif
 
-      cuda_safe_call(cuCtxSetCurrent(saved_ctx));
-    }
+    cuda_safe_call(cuCtxSetCurrent(saved_ctx));
+  }
 
-    ::std::string to_string() const override
-    {
-      return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid)
-           + ")";
-    }
+  ::std::string to_string() const override
+  {
+    return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")";
+  }
 
-    stream_pool& get_stream_pool(bool) const override
-    {
-      return pool;
-    }
+  stream_pool& get_stream_pool(bool) const override
+  {
+    return pool_;
+  }
 
-    bool operator==(const exec_place::impl& rhs) const override
+  int cmp(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return false;
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      // Compare green context handles
-      return g_ctx == other.g_ctx;
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
-
-    size_t hash() const override
+    const auto& other = static_cast<const exec_place_green_ctx_impl&>(rhs);
+    if (g_ctx_ < other.g_ctx_)
     {
-      // Hash the green context handle, not the affine data place
-      return ::std::hash<CUgreenCtx>()(g_ctx);
+      return -1;
     }
-
-    bool operator<(const exec_place::impl& rhs) const override
+    if (other.g_ctx_ < g_ctx_)
     {
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      const auto& other = static_cast<const impl&>(rhs);
-      return g_ctx < other.g_ctx;
+      return 1;
     }
+    return 0;
+  }
 
-  private:
-    int devid        = -1;
-    CUgreenCtx g_ctx = {};
-    // a context created from the green context (or used to store an existing context to implement
-    // activate/deactivate)
-    mutable CUcontext driver_context = {};
-    mutable stream_pool pool;
-  };
-
-public:
-  exec_place_green_ctx(green_ctx_view gc_view, bool use_green_ctx_data_place = false)
-      : exec_place(::std::make_shared<impl>(mv(gc_view), use_green_ctx_data_place))
+  size_t hash() const override
   {
-    static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place),
-                  "exec_place_green_ctx cannot add state; it would be sliced away.");
+    return ::std::hash<CUgreenCtx>()(g_ctx_);
   }
+
+private:
+  int devid_                        = -1;
+  CUgreenCtx g_ctx_                 = {};
+  mutable CUcontext driver_context_ = {};
+  mutable stream_pool pool_;
 };
 
 inline exec_place exec_place::green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place)
 {
-  return exec_place_green_ctx(gc_view, use_green_ctx_data_place);
+  return exec_place(::std::make_shared<exec_place_green_ctx_impl>(gc_view, use_green_ctx_data_place));
 }
 
 inline ::std::shared_ptr<void> green_ctx_data_place_impl::get_affine_exec_impl() const
diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
index d7cd70dca54..df67b92396b 100644
--- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh
@@ -80,7 +80,7 @@ inline ::std::string place_partition_scope_to_string(place_partition_scope scope
  * `cuda_device` scope. Green context scope requires CUDA 12.4 or later.
  *
  * Iteration over subplaces is provided via `begin()` / `end()`; `as_grid()` builds
- * an `exec_place_grid` from the subplaces.
+ * an `exec_place` grid from the subplaces.
  */
 class place_partition
 {
@@ -131,14 +131,13 @@ public:
    * @param grid Input execution place grid to partition
    * @param scope Partitioning granularity
    */
-  place_partition(async_resources_handle& handle, const exec_place_grid& grid, place_partition_scope scope)
+  place_partition(async_resources_handle& handle, const exec_place& grid, place_partition_scope scope)
   {
     ::std::vector<::std::shared_ptr<exec_place>> places;
-    const auto& grid_places = grid.get_places();
-    places.reserve(grid_places.size());
-    for (const auto& ep : grid_places)
+    places.reserve(grid.size());
+    for (size_t i = 0; i < grid.size(); ++i)
     {
-      places.push_back(::std::make_shared<exec_place>(ep));
+      places.push_back(::std::make_shared<exec_place>(grid.get_place(i)));
     }
     for (const auto& place : places)
     {
@@ -210,10 +209,10 @@ public:
     return sub_places[i];
   }
 
-  /** @brief Build an exec_place_grid from the subplaces.
-   * @return A grid view of the partitioned execution places.
+  /** @brief Build an exec_place from the subplaces.
+   * @return A grid view of the partitioned execution places, or a single place if there is exactly one subplace.
    */
-  exec_place_grid as_grid() const
+  exec_place as_grid() const
   {
     return make_grid(sub_places);
   }
@@ -222,9 +221,9 @@ private:
   /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */
   void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope)
   {
-    if (place.is_grid() && scope == place_partition_scope::cuda_stream)
+    // Handle multi-element grids by recursively partitioning
+    if (place.size() > 1 && scope == place_partition_scope::cuda_stream)
     {
-      // Recursively partition grid into devices, then into streams
       for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device))
       {
         auto device_p_places = place_partition(device_p, handle, place_partition_scope::cuda_stream).sub_places;
@@ -233,6 +232,27 @@ private:
       return;
     }
 
+    // Handle scalar places (including 1-element grids) for cuda_stream scope
+    if (place.size() == 1 && scope == place_partition_scope::cuda_stream)
+    {
+      // Get the underlying scalar place (for 1-element grids, get the single element)
+      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
+      if (!scalar_place.is_device())
+      {
+        // Host or other non-device place - no streams to partition into
+        sub_places.push_back(place);
+        return;
+      }
+      auto& pool = scalar_place.get_stream_pool(true);
+      for (size_t i = 0; i < pool.size(); i++)
+      {
+        decorated_stream dstream = pool.next(scalar_place);
+        sub_places.push_back(exec_place::cuda_stream(dstream));
+      }
+      return;
+    }
+
+    // Legacy explicit device check. NOTE(review): looks unreachable when size() >= 1 (both branches above return) - confirm.
     if (place.is_device() && scope == place_partition_scope::cuda_stream)
     {
       auto& pool = place.get_stream_pool(true);
@@ -247,7 +267,7 @@ private:
 
 // Green contexts are only supported since CUDA 12.4
 #if _CCCL_CTK_AT_LEAST(12, 4)
-    if (place.is_grid() && scope == place_partition_scope::green_context)
+    if (place.size() > 1 && scope == place_partition_scope::green_context)
     {
       // Recursively partition grid into devices, then into green contexts
       for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device))
@@ -258,18 +278,40 @@ private:
       return;
     }
 
+    // Handle scalar places (including 1-element grids) for green_context scope
+    if (place.size() == 1 && scope == place_partition_scope::green_context)
+    {
+      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
+      if (!scalar_place.is_device())
+      {
+        sub_places.push_back(place);
+        return;
+      }
+      int dev_id = device_ordinal(scalar_place.affine_data_place());
+
+      const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE");
+      int sm_cnt      = env ? atoi(env) : 8;
+
+      auto h = handle.get_gc_helper(dev_id, sm_cnt);
+
+      size_t cnt = h->get_count();
+      for (size_t i = 0; i < cnt; i++)
+      {
+        sub_places.push_back(exec_place::green_ctx(h->get_view(i)));
+      }
+      return;
+    }
+
+    // Legacy explicit device check. NOTE(review): looks unreachable when size() >= 1 (both branches above return) - confirm.
     if (place.is_device() && scope == place_partition_scope::green_context)
     {
-      // Find the device associated to the place, and get the green context helper
       int dev_id = device_ordinal(place.affine_data_place());
 
-      // 8 SMs per green context is a granularity that should work on any arch.
       const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE");
       int sm_cnt      = env ? atoi(env) : 8;
 
       auto h = handle.get_gc_helper(dev_id, sm_cnt);
 
-      // Get views of green context out of the helper to create execution places
       size_t cnt = h->get_count();
       for (size_t i = 0; i < cnt; i++)
       {
@@ -291,17 +333,26 @@ private:
 #endif // _CCCL_CTK_BELOW(12, 4)
     _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle.");
 
-    if (place.is_grid() && scope == place_partition_scope::cuda_device)
-    {
-      exec_place_grid g = place.as_grid();
-      // Copy the vector of places
-      sub_places = g.get_places();
-      return;
-    }
-
-    if (place.is_device() && scope == place_partition_scope::cuda_device)
+    if (scope == place_partition_scope::cuda_device)
     {
-      sub_places.push_back(place);
+      if (place.size() > 1)
+      {
+        // Multi-element grid: extract all places
+        for (size_t i = 0; i < place.size(); ++i)
+        {
+          sub_places.push_back(place.get_place(i));
+        }
+      }
+      else if (place.is_device())
+      {
+        // Scalar device place
+        sub_places.push_back(place);
+      }
+      else
+      {
+        // 1-element grid or other scalar place: extract the underlying place
+        sub_places.push_back(place.get_place(0));
+      }
       return;
     }
 
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index 8b8a75c23cc..903d15481cf 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -50,13 +50,8 @@ namespace cuda::experimental::stf
 {
 class exec_place;
 class exec_place_host;
-class exec_place_grid;
-class exec_place_cuda_stream;
 
 // Green contexts are only supported since CUDA 12.4
-#if _CCCL_CTK_AT_LEAST(12, 4)
-class exec_place_green_ctx;
-#endif // _CCCL_CTK_AT_LEAST(12, 4)
 
 //! Function type for computing executor placement from data coordinates
 using get_executor_func_t = pos4 (*)(pos4, dim4, dim4);
@@ -169,9 +164,9 @@ public:
 
   // User-visible API when using a different partitioner than the one of the grid
   template <typename partitioner_t /*, typename scalar_exec_place_t */>
-  static data_place composite(partitioner_t p, const exec_place_grid& g);
+  static data_place composite(partitioner_t p, const exec_place& g);
 
-  static data_place composite(get_executor_func_t f, const exec_place_grid& grid);
+  static data_place composite(get_executor_func_t f, const exec_place& grid);
 
 #if _CCCL_CTK_AT_LEAST(12, 4)
   static data_place green_ctx(const green_ctx_view& gc_view);
@@ -300,7 +295,7 @@ public:
     return p.pimpl_->get_device_ordinal();
   }
 
-  const exec_place_grid& get_grid() const
+  const exec_place& get_grid() const
   {
     return pimpl_->get_grid();
   }
@@ -377,18 +372,19 @@ inline data_place from_index(size_t n);
 /**
  * @brief Indicates where a computation takes place (CPU, dev0, dev1, ...)
  *
- * Currently data and computation are together `(devid == int(data_place))`.
+ * All execution places are modeled as grids. Scalar places (host, single device)
+ * are simply 1-element grids. This unified model eliminates special-casing and
+ * allows uniform iteration over any exec_place.
  */
 class exec_place
 {
 public:
   /*
-   * @brief Using the pimpl idiom. Public because a number of classes inehrit from this.
+   * @brief Using the pimpl idiom. Public because a number of classes inherit from this.
    */
   class impl
   {
   public:
-    // Note that the default ctor assumes an invalid affine data place
     impl()                       = default;
     impl(const impl&)            = delete;
     impl& operator=(const impl&) = delete;
@@ -398,8 +394,44 @@ public:
         : affine(mv(place))
     {}
 
-    virtual exec_place activate() const
+    // ===== Grid interface (all places are grids) =====
+
+    /**
+     * @brief Get the dimensions of this grid
+     *
+     * For scalar places, returns dim4(1, 1, 1, 1).
+     */
+    virtual dim4 get_dims() const
+    {
+      return dim4(1, 1, 1, 1);
+    }
+
+    /**
+     * @brief Get the total number of places in this grid
+     */
+    virtual size_t size() const
     {
+      return 1;
+    }
+
+    /**
+     * @brief Get the sub-place at the given linear index
+     *
+     * For scalar places, idx must be 0.
+     */
+    virtual exec_place get_place(size_t idx) const;
+
+    // ===== Activation/deactivation (indexed) =====
+
+    /**
+     * @brief Activate the sub-place at the given index
+     *
+     * For scalar places, idx must be 0.
+     * Returns the previous execution state needed for deactivate().
+     */
+    virtual exec_place activate(size_t idx) const
+    {
+      EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
       if (!affine.is_device())
       {
         return exec_place();
@@ -410,12 +442,15 @@ public:
       {
         cuda_safe_call(cudaSetDevice(new_dev_id));
       }
-      auto old_dev = data_place::device(old_dev_id);
-      return exec_place(mv(old_dev));
+      return exec_place(data_place::device(old_dev_id));
     }
 
-    virtual void deactivate(const exec_place& prev) const
+    /**
+     * @brief Deactivate the sub-place at the given index, restoring previous state
+     */
+    virtual void deactivate(size_t idx, const exec_place& prev) const
     {
+      EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
       if (affine.is_device())
       {
         auto current_dev_id  = cuda_try<cudaGetDevice>();
@@ -427,7 +462,9 @@ public:
       }
     }
 
-    virtual const data_place affine_data_place() const
+    // ===== Properties =====
+
+    virtual data_place affine_data_place() const
     {
       return affine;
     }
@@ -437,34 +474,32 @@ public:
       return "exec(" + affine.to_string() + ")";
     }
 
-    virtual bool is_host() const
-    {
-      return affine.is_host();
-    }
-
-    virtual bool is_device() const
-    {
-      return affine.is_device();
-    }
-
-    virtual bool is_grid() const
-    {
-      return false;
-    }
-
-    virtual size_t size() const
-    {
-      return 1;
-    }
-
     virtual void set_affine_data_place(data_place place)
     {
       affine = mv(place);
     }
 
-    virtual bool operator==(const impl& rhs) const
+    // ===== Comparison =====
+
+    /**
+     * @brief Three-way comparison
+     * @return -1 if *this < rhs, 0 if *this == rhs, 1 if *this > rhs
+     */
+    virtual int cmp(const impl& rhs) const
     {
-      return affine == rhs.affine;
+      if (typeid(*this) != typeid(rhs))
+      {
+        return typeid(*this).before(typeid(rhs)) ? -1 : 1;
+      }
+      if (affine < rhs.affine)
+      {
+        return -1;
+      }
+      if (rhs.affine < affine)
+      {
+        return 1;
+      }
+      return 0;
     }
 
     virtual size_t hash() const
@@ -472,24 +507,8 @@ public:
       return affine.hash();
     }
 
-    virtual bool operator<(const impl& rhs) const
-    {
-      // Different types: order by typeid
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      // Same type (both base impl): compare by device ID
-      // (base impl stores devid in affine, so we extract it via device_ordinal)
-      return device_ordinal(affine) < device_ordinal(rhs.affine);
-    }
+    // ===== Stream management =====
 
-    /**
-     * @brief Get the stream pool for this execution place.
-     *
-     * The base implementation returns pool_compute or pool_data stored
-     * directly on the impl.
-     */
     virtual stream_pool& get_stream_pool(bool for_computation) const
     {
       return for_computation ? pool_compute : pool_data;
@@ -498,6 +517,24 @@ public:
     static constexpr size_t pool_size      = 4;
     static constexpr size_t data_pool_size = 4;
 
+    // Grid iteration state - only meaningful for multi-element grids
+    virtual ::std::ptrdiff_t get_current_idx() const
+    {
+      return -1;
+    }
+    virtual void set_current_idx(::std::ptrdiff_t) const
+    {
+      _CCCL_ASSERT(false, "set_current_idx called on non-grid exec_place");
+    }
+    virtual ::std::shared_ptr<impl> get_saved_prev_impl() const
+    {
+      return nullptr;
+    }
+    virtual void set_saved_prev_impl(::std::shared_ptr<impl>) const
+    {
+      _CCCL_ASSERT(false, "set_saved_prev_impl called on non-grid exec_place");
+    }
+
   protected:
     friend class exec_place;
     data_place affine = data_place::invalid();
@@ -515,17 +552,21 @@ public:
 
   bool operator==(const exec_place& rhs) const
   {
-    return *pimpl == *rhs.pimpl;
+    if (pimpl.get() == rhs.pimpl.get())
+    {
+      return true;
+    }
+    return pimpl->cmp(*rhs.pimpl) == 0;
   }
+
   bool operator!=(const exec_place& rhs) const
   {
     return !(*this == rhs);
   }
 
-  // To use in a ::std::map indexed by exec_place
   bool operator<(const exec_place& rhs) const
   {
-    return *pimpl < *rhs.pimpl;
+    return pimpl->cmp(*rhs.pimpl) < 0;
   }
 
   bool operator>(const exec_place& rhs) const
@@ -543,20 +584,51 @@ public:
     return !(*this < rhs);
   }
 
+  size_t hash() const
+  {
+    return pimpl->hash();
+  }
+
+  // ===== Grid interface (all places are grids) =====
+
   /**
-   * @brief Compute a hash value for this execution place
+   * @brief Get the dimensions of this grid
    *
-   * Used by std::hash specialization for unordered containers.
+   * For scalar places (host, single device), returns dim4(1, 1, 1, 1).
    */
-  size_t hash() const
+  dim4 get_dims() const
   {
-    return pimpl->hash();
+    return pimpl->get_dims();
   }
 
   /**
-   * @brief an iterator class which goes over all subplaces in an exec place.
+   * @brief Get the total number of places in this grid
+   */
+  size_t size() const
+  {
+    return pimpl->size();
+  }
+
+  /**
+   * @brief Get the sub-place at the given linear index
    *
-   * This is a trivial singleton unless we have a grid of places.
+   * For scalar places, idx must be 0 and returns the place itself.
+   */
+  exec_place get_place(size_t idx) const
+  {
+    return pimpl->get_place(idx);
+  }
+
+  /**
+   * @brief Get the sub-place at the given multi-dimensional position
+   */
+  exec_place get_place(pos4 p) const
+  {
+    return get_place(get_dims().get_index(p));
+  }
+
+  /**
+   * @brief an iterator class which goes over all subplaces in an exec place.
    */
   class iterator
   {
@@ -566,7 +638,10 @@ public:
         , index(index)
     {}
 
-    exec_place operator*();
+    exec_place operator*()
+    {
+      return it_impl->get_place(index);
+    }
 
     iterator& operator++()
     {
@@ -598,98 +673,165 @@ public:
     return iterator(pimpl, pimpl->size());
   }
 
+  // ===== Activation/deactivation =====
+
   /**
-   * @brief Returns a string representation of the execution place object.
+   * @brief Activate the sub-place at the given index
    *
-   * @return std::string
+   * @param idx The index of the sub-place to activate (default 0 for scalar places)
+   * @return The previous execution state needed for deactivate()
    */
-  ::std::string to_string() const
+  exec_place activate(size_t idx = 0) const
   {
-    return pimpl->to_string();
+    return pimpl->activate(idx);
   }
 
   /**
-   * @brief Returns the `data_place` naturally associated with this execution place.
+   * @brief Deactivate the sub-place at the given index, restoring previous state
+   *
+   * @param idx The index of the sub-place to deactivate (must be 0 for scalar places)
+   * @param prev The previous state returned by activate()
    */
-  const data_place affine_data_place() const
+  void deactivate(size_t idx, const exec_place& prev) const
   {
-    return pimpl->affine_data_place();
+    pimpl->deactivate(idx, prev);
   }
 
-  void set_affine_data_place(data_place place)
+  /**
+   * @brief Convenience overload for scalar places (idx=0)
+   */
+  void deactivate(const exec_place& prev) const
   {
-    pimpl->set_affine_data_place(mv(place));
+    deactivate(0, prev);
   }
 
-  stream_pool& get_stream_pool(bool for_computation) const
+  /**
+   * @brief Set the current place for grid iteration
+   *
+   * Activates the place at the given index and saves state for later restoration.
+   */
+  void set_current_place(size_t idx)
   {
-    return pimpl->get_stream_pool(for_computation);
+    auto cur_idx = pimpl->get_current_idx();
+    if (cur_idx >= 0)
+    {
+      exec_place saved_prev(pimpl->get_saved_prev_impl());
+      pimpl->deactivate(cur_idx, saved_prev);
+    }
+    pimpl->set_current_idx(static_cast<::std::ptrdiff_t>(idx));
+    exec_place prev = pimpl->activate(idx);
+    pimpl->set_saved_prev_impl(prev.pimpl);
   }
 
   /**
-   * @brief Get a decorated stream from the stream pool associated to this execution place.
+   * @brief Set the current place using multi-dimensional position
    */
-  decorated_stream getStream(bool for_computation) const;
-
-  cudaStream_t pick_stream(bool for_computation = true) const
+  void set_current_place(pos4 p)
   {
-    return getStream(for_computation).stream;
+    set_current_place(get_dims().get_index(p));
   }
 
-  // TODO make protected !
-  const ::std::shared_ptr<impl>& get_impl() const
+  /**
+   * @brief Unset the current place, restoring previous execution context
+   */
+  void unset_current_place()
   {
-    return pimpl;
+    auto cur_idx = pimpl->get_current_idx();
+    EXPECT(cur_idx >= 0, "unset_current_place() called without corresponding set_current_place()");
+    exec_place saved_prev(pimpl->get_saved_prev_impl());
+    pimpl->deactivate(cur_idx, saved_prev);
+    pimpl->set_current_idx(-1);
   }
 
   /**
-   * @brief Set computation to run on this place.
-   *
-   * @return `exec_place` The previous execution place. See `deactivate` below.
+   * @brief Get the currently active sub-place
    */
-  exec_place activate() const
+  exec_place get_current_place() const
   {
-    return pimpl->activate();
+    auto cur_idx = pimpl->get_current_idx();
+    EXPECT(cur_idx >= 0, "No current place set");
+    return get_place(cur_idx);
   }
 
   /**
-   * @brief Undoes the effect of `activate`. Call with the previous `exec_place` object returned by `activate`.
-   *
-   * @warning Undefined behavior if you don't pass the result of `activate`.
+   * @brief Get the index of the currently active sub-place, or -1 if none
    */
-  void deactivate(const exec_place& p) const
+  ::std::ptrdiff_t current_place_id() const
+  {
+    return pimpl->get_current_idx();
+  }
+
+  // ===== Properties =====
+
+  ::std::string to_string() const
+  {
+    return pimpl->to_string();
+  }
+
+  data_place affine_data_place() const
   {
-    pimpl->deactivate(p);
+    return pimpl->affine_data_place();
+  }
+
+  void set_affine_data_place(data_place place)
+  {
+    pimpl->set_affine_data_place(mv(place));
+  }
+
+  stream_pool& get_stream_pool(bool for_computation) const
+  {
+    return pimpl->get_stream_pool(for_computation);
+  }
+
+  decorated_stream getStream(bool for_computation) const;
+
+  cudaStream_t pick_stream(bool for_computation = true) const
+  {
+    return getStream(for_computation).stream;
+  }
+
+  const ::std::shared_ptr<impl>& get_impl() const
+  {
+    return pimpl;
   }
 
   bool is_host() const
   {
-    return pimpl->is_host();
+    return affine_data_place().is_host();
   }
 
   bool is_device() const
   {
-    return pimpl->is_device();
+    return affine_data_place().is_device();
   }
 
-  bool is_grid() const
+  /**
+   * @brief Get the dimension along a specific axis
+   * @deprecated Use get_dims().get(axis_id) instead
+   */
+  size_t grid_dim(int axis_id) const
   {
-    return pimpl->is_grid();
+    return get_dims().get(axis_id);
   }
 
-  size_t size() const
+  /**
+   * @brief Get all dimensions
+   * @deprecated Use get_dims() instead
+   */
+  dim4 grid_dims() const
   {
-    return pimpl->size();
+    return get_dims();
   }
 
-  // Get the implementation assuming this is a grid
-  // We need to defer the implementation after exec_place_grid has been
-  // defined because this requires a ::std::static_pointer_cast from the base
-  // class to exec_place_grid
-  exec_place_grid as_grid() const;
-
-  size_t grid_dim(int axid_is) const;
-  dim4 grid_dims() const;
+  /**
+   * @brief Returns *this for compatibility; requires size() > 1 (checked with EXPECT)
+   * @deprecated All places are grids now; use exec_place methods directly
+   */
+  const exec_place& as_grid() const
+  {
+    EXPECT(size() > 1, "as_grid() called on scalar exec_place");
+    return *this;
+  }
 
   /* These helper methods provide convenient way to express execution places,
    * for example exec_place::host or exec_place::device(4).
@@ -711,8 +853,8 @@ public:
   static exec_place green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place = false);
 #endif // _CCCL_CTK_AT_LEAST(12, 4)
 
-  static exec_place_cuda_stream cuda_stream(cudaStream_t stream);
-  static exec_place_cuda_stream cuda_stream(const decorated_stream& dstream);
+  static exec_place cuda_stream(cudaStream_t stream);
+  static exec_place cuda_stream(const decorated_stream& dstream);
 
   /**
    * @brief Returns the currently active device.
@@ -724,14 +866,14 @@ public:
     return exec_place::device(cuda_try<cudaGetDevice>());
   }
 
-  static exec_place_grid all_devices();
+  static exec_place all_devices();
 
-  static exec_place_grid n_devices(size_t n, dim4 dims);
+  static exec_place n_devices(size_t n, dim4 dims);
 
-  static exec_place_grid n_devices(size_t n);
+  static exec_place n_devices(size_t n);
 
   // For debug purpose on a machine with a single GPU, for example
-  static exec_place_grid repeat(const exec_place& e, size_t cnt);
+  static exec_place repeat(const exec_place& e, size_t cnt);
 
   template <typename... Args>
   auto partition_by_scope(Args&&... args);
@@ -870,11 +1012,11 @@ inline decorated_stream exec_place::getStream(bool for_computation) const
 /**
  * @brief Designates execution that is to run on the host.
  *
+ * Host is modeled as a 1-element grid containing the host execution context.
  */
 class exec_place_host : public exec_place
 {
 public:
-  // Implementation of the exec_place_host class
   class impl : public exec_place::impl
   {
   public:
@@ -882,21 +1024,27 @@ public:
         : exec_place::impl(data_place::host())
     {}
 
-    // operator<: base class implementation is correct (compares typeid, then device_ordinal).
-    // Since host is a singleton, all instances compare equal.
+    // Grid interface - host is a 1-element grid
+    exec_place get_place(size_t idx) const override;
 
-    exec_place activate() const override
+    // Activation - no-op for host
+    exec_place activate(size_t idx) const override
     {
+      EXPECT(idx == 0, "Index out of bounds for host exec_place");
       return exec_place();
-    } // no-op
-    void deactivate(const exec_place& p) const override
+    }
+
+    void deactivate(size_t idx, const exec_place& prev) const override
     {
-      _CCCL_ASSERT(!p.get_impl(), "");
-    } // no-op
-    virtual const data_place affine_data_place() const override
+      EXPECT(idx == 0, "Index out of bounds for host exec_place");
+      _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev");
+    }
+
+    data_place affine_data_place() const override
     {
       return data_place::host();
     }
+
     stream_pool& get_stream_pool(bool for_computation) const override
     {
       return exec_place::current_device().get_stream_pool(for_computation);
@@ -943,6 +1091,8 @@ UNITTEST("exec_place_host::operator->*")
 
 /**
  * @brief Designates execution that is to run on a specific CUDA device.
+ *
+ * Device is modeled as a 1-element grid containing that device.
  */
 class exec_place_device : public exec_place
 {
@@ -952,10 +1102,22 @@ public:
   public:
     explicit impl(int devid)
         : exec_place::impl(data_place::device(devid))
+        , devid_(devid)
     {
       pool_compute = stream_pool(pool_size);
       pool_data    = stream_pool(data_pool_size);
     }
+
+    // Grid interface - device is a 1-element grid
+    exec_place get_place(size_t idx) const override;
+
+    int get_devid() const
+    {
+      return devid_;
+    }
+
+  private:
+    int devid_;
   };
 };
 
@@ -1022,335 +1184,163 @@ UNITTEST("exec_place copyable")
 };
 #endif // UNITTESTED_FILE
 
-//! A multidimensional grid of execution places for structured parallel computation
-class exec_place_grid : public exec_place
+/**
+ * Implementation class for multi-device execution place grids.
+ * This is used internally by make_grid() and related factory functions.
+ */
+class exec_place_grid_impl : public exec_place::impl
 {
 public:
-  /*
-   * Implementation of the exec_place_grid
-   */
-  class impl : public exec_place::impl
+  exec_place_grid_impl(::std::vector<exec_place> _places)
+      : dims_(_places.size(), 1, 1, 1)
+      , places_(mv(_places))
   {
-  public:
-    // Define a grid directly from a vector of places
-    // This creates an execution grid automatically
-    impl(::std::vector<exec_place> _places)
-        : dims(_places.size(), 1, 1, 1)
-        , places(mv(_places))
-    {
-      _CCCL_ASSERT(!places.empty(), "");
-      _CCCL_ASSERT(dims.x > 0, "");
-      _CCCL_ASSERT(affine.is_invalid(), "");
-    }
-
-    // With a "dim4 shape"
-    impl(::std::vector<exec_place> _places, const dim4& _dims)
-        : dims(_dims)
-        , places(mv(_places))
-    {
-      _CCCL_ASSERT(dims.x > 0, "");
-      _CCCL_ASSERT(affine.is_invalid(), "");
-    }
-
-    // TODO improve with a better description
-    ::std::string to_string() const final
-    {
-      return ::std::string("GRID place");
-    }
-
-    exec_place activate() const override
-    {
-      // No-op
-      return exec_place();
-    }
-
-    // TODO : shall we deactivate the current place, if any ?
-    void deactivate(const exec_place& _prev) const override
-    {
-      // No-op
-      EXPECT(!_prev.get_impl(), "Invalid execution place.");
-    }
-
-    /* Dynamically checks whether an execution place is a device */
-    bool is_device() const override
-    {
-      return false;
-    }
-
-    /* Dynamically checks whether an execution place is a grid */
-    bool is_grid() const override
-    {
-      return true;
-    }
-
-    bool operator==(const exec_place::impl& rhs) const override
-    {
-      // First, check if rhs is of type exec_place_grid::impl
-      auto other = dynamic_cast<const impl*>(&rhs);
-      if (!other)
-      {
-        return false; // rhs is not a grid, so they are not equal
-      }
-
-      // Compare two grids
-      return *this == *other;
-    }
-
-    // Compare two grids
-    bool operator==(const impl& rhs) const
-    {
-      // Compare grid-specific properties
-      // Note: for grids, equality is determined by dims and places, not the affine data place
-      return dims == rhs.dims && places == rhs.places;
-    }
-
-    size_t hash() const override
-    {
-      // Hash based on dims and places, consistent with operator==
-      size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims);
-      for (const auto& p : places)
-      {
-        hash_combine(h, p.hash());
-      }
-      return h;
-    }
-
-    bool operator<(const exec_place::impl& rhs) const override
-    {
-      // Different types: order by typeid
-      if (typeid(*this) != typeid(rhs))
-      {
-        return typeid(*this).before(typeid(rhs));
-      }
-      // Same type: safe to cast
-      const auto& other = static_cast<const impl&>(rhs);
-      // Compare dims first, then places
-      if (!(dims == other.dims))
-      {
-        // Use tuple comparison for consistent ordering
-        return ::std::tie(dims.x, dims.y, dims.z, dims.t)
-             < ::std::tie(other.dims.x, other.dims.y, other.dims.z, other.dims.t);
-      }
-      return places < other.places;
-    }
-
-    const ::std::vector<exec_place>& get_places() const
-    {
-      return places;
-    }
-
-    stream_pool& get_stream_pool(bool for_computation) const override
-    {
-      _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool");
-      const auto& v = get_places();
-      _CCCL_ASSERT(v.size() > 0, "Grid must have at least one place");
-      return v[0].get_stream_pool(for_computation);
-    }
-
-    exec_place grid_activate(size_t i) const
-    {
-      const auto& v = get_places();
-      return v[i].activate();
-    }
+    _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+    _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
+  }
 
-    void grid_deactivate(size_t i, exec_place p) const
-    {
-      const auto& v = get_places();
-      v[i].deactivate(p);
-    }
+  exec_place_grid_impl(::std::vector<exec_place> _places, const dim4& _dims)
+      : dims_(_dims)
+      , places_(mv(_places))
+  {
+    _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive");
+  }
 
-    const exec_place& get_current_place()
-    {
-      return get_places()[current_p_1d];
-    }
+  // ===== Grid interface =====
 
-    // Set the current place from the 1D index within the grid (flattened grid)
-    void set_current_place(size_t p_index)
-    {
-      // Unset the previous place, if any
-      if (current_p_1d >= 0)
-      {
-        // First deactivate the previous place
-        grid_deactivate(current_p_1d, old_place);
-      }
+  dim4 get_dims() const override
+  {
+    return dims_;
+  }
 
-      // get the 1D index for that position
-      current_p_1d = (::std::ptrdiff_t) p_index;
+  size_t size() const override
+  {
+    return dims_.size();
+  }
 
-      // The returned value contains the state to restore when we deactivate the place
-      old_place = grid_activate(current_p_1d);
-    }
+  exec_place get_place(size_t idx) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    return places_[idx];
+  }
 
-    // Set the current place, given the position in the grid
-    void set_current_place(pos4 p)
-    {
-      size_t p_index = dims.get_index(p);
-      set_current_place(p_index);
-    }
+  // ===== Activation (delegates to sub-places) =====
 
-    void unset_current_place()
-    {
-      EXPECT(current_p_1d >= 0, "unset_current_place() called without corresponding call to set_current_place()");
+  exec_place activate(size_t idx) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    return places_[idx].activate(0);
+  }
 
-      // First deactivate the previous place
-      grid_deactivate(current_p_1d, old_place);
-      current_p_1d = -1;
-    }
+  void deactivate(size_t idx, const exec_place& prev) const override
+  {
+    EXPECT(idx < places_.size(), "Index out of bounds");
+    places_[idx].deactivate(0, prev);
+  }
 
-    ::std::ptrdiff_t current_place_id() const
-    {
-      return current_p_1d;
-    }
+  // ===== Properties =====
 
-    dim4 get_dims() const
-    {
-      return dims;
-    }
+  ::std::string to_string() const override
+  {
+    return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + "x"
+         + ::std::to_string(dims_.t) + ")";
+  }
 
-    size_t get_dim(int axis_id) const
-    {
-      return dims.get(axis_id);
-    }
+  // ===== Comparison =====
 
-    size_t size() const override
+  int cmp(const exec_place::impl& rhs) const override
+  {
+    if (typeid(*this) != typeid(rhs))
     {
-      return dims.size();
+      return typeid(*this).before(typeid(rhs)) ? -1 : 1;
     }
-
-    /* Get the place associated to this position in the grid */
-    const exec_place& get_place(pos4 p) const
+    const auto& other = static_cast<const exec_place_grid_impl&>(rhs);
+    // Compare dims first
+    auto this_dims  = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t);
+    auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t);
+    if (this_dims < other_dims)
     {
-      return coords_to_place(p);
+      return -1;
     }
-
-    const exec_place& get_place(size_t p_index) const
+    if (other_dims < this_dims)
     {
-      return coords_to_place(p_index);
+      return 1;
     }
-
-  private:
-    // What is the execution place at theses coordinates in the exec place grid ?
-    const exec_place& coords_to_place(size_t c0, size_t c1 = 0, size_t c2 = 0, size_t c3 = 0) const
+    // Then compare places
+    if (places_ < other.places_)
     {
-      // Flatten the (c0, c1, c2, c3) vector into a global index
-      size_t index = c0 + dims.get(0) * (c1 + dims.get(1) * (c2 + c3 * dims.get(2)));
-      return places[index];
+      return -1;
     }
-
-    const exec_place& coords_to_place(pos4 coords) const
+    if (other.places_ < places_)
     {
-      return coords_to_place(coords.x, coords.y, coords.z, coords.t);
+      return 1;
     }
-
-    // current position in the grid (flattened to 1D) if we have a grid of
-    // execution place. -1 indicates there is no current position.
-    ::std::ptrdiff_t current_p_1d = -1;
-
-    // saved state before setting the current place
-    exec_place old_place;
-
-    // dimensions of the "grid"
-    dim4 dims;
-    ::std::vector<exec_place> places;
-  };
-
-  ///@{ @name Constructors
-  dim4 get_dims() const
-  {
-    return get_impl()->get_dims();
+    return 0;
   }
 
-  size_t get_dim(int axis_id) const
-  {
-    return get_dims().get(axis_id);
-  }
-
-  size_t size() const
-  {
-    return get_dims().size();
-  }
-
-  explicit operator bool() const
-  {
-    return get_impl() != nullptr;
-  }
-
-  /* Note that we compare against the exact same implementation : we could
-   * have equivalent grids with the same execution places, but to avoid a
-   * costly comparison we here only look for actually identical grids.
-   */
-  bool operator==(const exec_place_grid& rhs) const
-  {
-    return *get_impl() == *(rhs.get_impl());
-  }
-
-  ::std::ptrdiff_t current_place_id() const
+  size_t hash() const override
   {
-    return get_impl()->current_place_id();
+    size_t h = ::cuda::experimental::stf::hash<dim4>{}(dims_);
+    for (const auto& p : places_)
+    {
+      hash_combine(h, p.hash());
+    }
+    return h;
   }
 
-  const exec_place& get_place(pos4 p) const
-  {
-    return get_impl()->get_place(p);
-  }
+  // ===== Stream management =====
 
-  const ::std::vector<exec_place>& get_places() const
+  stream_pool& get_stream_pool(bool for_computation) const override
   {
-    return get_impl()->get_places();
+    _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool");
+    _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place");
+    return places_[0].get_stream_pool(for_computation);
   }
 
-  // Set the current place from the 1D index within the grid (flattened grid)
-  void set_current_place(size_t p_index)
-  {
-    return get_impl()->set_current_place(p_index);
-  }
+  // ===== Grid iteration state =====
 
-  // Get the current execution place
-  const exec_place& get_current_place()
+  ::std::ptrdiff_t get_current_idx() const override
   {
-    return get_impl()->get_current_place();
+    return current_idx_;
   }
 
-  // Set the current place, given the position in the grid
-  void set_current_place(pos4 p)
+  void set_current_idx(::std::ptrdiff_t idx) const override
   {
-    return get_impl()->set_current_place(p);
+    current_idx_ = idx;
   }
 
-  void unset_current_place()
+  ::std::shared_ptr<exec_place::impl> get_saved_prev_impl() const override
   {
-    return get_impl()->unset_current_place();
+    return saved_prev_impl_;
   }
 
-  ::std::shared_ptr<impl> get_impl() const
+  void set_saved_prev_impl(::std::shared_ptr<exec_place::impl> p) const override
   {
-    _CCCL_ASSERT(::std::dynamic_pointer_cast<impl>(exec_place::get_impl()), "Invalid exec_place_grid impl");
-    return ::std::static_pointer_cast<impl>(exec_place::get_impl());
+    saved_prev_impl_ = mv(p);
   }
 
-  // Default constructor
-  exec_place_grid()
-      : exec_place(nullptr)
-  {}
-
-  // private:
-  exec_place_grid(::std::shared_ptr<impl> p)
-      : exec_place(mv(p))
-  {}
-
-  exec_place_grid(::std::vector<exec_place> p, const dim4& d)
-      : exec_place(::std::make_shared<impl>(mv(p), d))
-  {}
+private:
+  dim4 dims_;
+  ::std::vector<exec_place> places_;
+  mutable ::std::ptrdiff_t current_idx_ = -1;
+  mutable ::std::shared_ptr<exec_place::impl> saved_prev_impl_;
 };
 
 //! Creates a grid of execution places with specified dimensions
-inline exec_place_grid make_grid(::std::vector<exec_place> places, const dim4& dims)
+//! Returns the single element if size == 1 (no grid wrapper needed)
+inline exec_place make_grid(::std::vector<exec_place> places, const dim4& dims)
 {
-  return exec_place_grid(mv(places), dims);
+  _CCCL_ASSERT(!places.empty(), "invalid places");
+  if (places.size() == 1)
+  {
+    return mv(places[0]);
+  }
+  return exec_place(::std::make_shared<exec_place_grid_impl>(mv(places), dims));
 }
 
 //! Creates a linear grid from a vector of execution places
-inline exec_place_grid make_grid(::std::vector<exec_place> places)
+//! Returns the single element if size == 1 (no grid wrapper needed)
+inline exec_place make_grid(::std::vector<exec_place> places)
 {
   _CCCL_ASSERT(!places.empty(), "invalid places");
   auto grid_dim = dim4(places.size(), 1, 1, 1);
@@ -1358,7 +1348,6 @@ inline exec_place_grid make_grid(::std::vector<exec_place> places)
 }
 
 // === data_place::affine_exec_place implementation ===
-// Defined here after exec_place_grid is complete
 
 inline exec_place data_place::affine_exec_place() const
 {
@@ -1376,7 +1365,6 @@ inline exec_place data_place::affine_exec_place() const
   if (is_composite())
   {
     // Return the grid of places associated to this composite data place
-    // exec_place_grid inherits from exec_place, so this works via slicing
     return get_grid();
   }
 
@@ -1398,45 +1386,43 @@ inline exec_place data_place::affine_exec_place() const
                            + ::std::to_string(pimpl_->get_device_ordinal()));
 }
 
-/// Implementation deferred because we need the definition of exec_place_grid
-inline exec_place exec_place::iterator::operator*()
-{
-  EXPECT(index < it_impl->size());
-  if (it_impl->is_grid())
-  {
-    return ::std::static_pointer_cast<exec_place_grid::impl>(it_impl)->get_place(index);
-  }
-  return exec_place(it_impl);
-}
+// === Deferred implementations for get_place() ===
 
-//! Creates a grid by replicating an execution place multiple times
-inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt)
+inline exec_place exec_place::impl::get_place(size_t idx) const
 {
-  return make_grid(::std::vector<exec_place>(cnt, e));
+  EXPECT(idx == 0, "Index out of bounds for scalar exec_place");
+  // Generic fallback: wrap `this` in a non-owning (aliasing) shared_ptr, so the result must not outlive this impl.
+  // Concrete implementations should override this instead of relying on the fallback.
+  return exec_place(
+    ::std::const_pointer_cast<impl>(::std::shared_ptr<const impl>(::std::shared_ptr<const impl>{}, this)));
 }
 
-/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */
-inline exec_place_grid exec_place::as_grid() const
+inline exec_place exec_place_host::impl::get_place(size_t idx) const
 {
-  // Make sure it is really a grid
-  EXPECT(is_grid());
-  return exec_place_grid(::std::static_pointer_cast<exec_place_grid::impl>(pimpl));
+  EXPECT(idx == 0, "Index out of bounds for host exec_place");
+  return exec_place::host();
 }
 
-inline dim4 exec_place::grid_dims() const
+inline exec_place exec_place_device::impl::get_place(size_t idx) const
 {
-  EXPECT(is_grid());
-  return ::std::static_pointer_cast<exec_place_grid::impl>(pimpl)->get_dims();
+  EXPECT(idx == 0, "Index out of bounds for device exec_place");
+  return exec_place::device(devid_);
 }
 
-inline size_t exec_place::grid_dim(int axis_id) const
+//! Creates a grid by replicating an execution place multiple times
+//! Returns the original place if cnt == 1 (no grid wrapper needed)
+inline exec_place exec_place::repeat(const exec_place& e, size_t cnt)
 {
-  EXPECT(is_grid());
-  return ::std::static_pointer_cast<exec_place_grid::impl>(pimpl)->get_dim(axis_id);
+  if (cnt == 1)
+  {
+    return e;
+  }
+  return make_grid(::std::vector<exec_place>(cnt, e));
 }
 
 /* Get the first N available devices */
-inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims)
+//! Returns single device if n == 1 (no grid wrapper needed)
+inline exec_place exec_place::n_devices(size_t n, dim4 dims)
 {
   const int ndevs = cuda_try<cudaGetDeviceCount>();
 
@@ -1453,21 +1439,23 @@ inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims)
 }
 
 /* Get the first N available devices */
-inline exec_place_grid exec_place::n_devices(size_t n)
+//! Returns single device if n == 1 (no grid wrapper needed)
+inline exec_place exec_place::n_devices(size_t n)
 {
   return n_devices(n, dim4(n, 1, 1, 1));
 }
 
-inline exec_place_grid exec_place::all_devices()
+//! Returns a place covering all available devices, or the single device place if only one GPU is present
+inline exec_place exec_place::all_devices()
 {
   return n_devices(cuda_try<cudaGetDeviceCount>());
 }
 
 //! Creates a cyclic partition of an execution place grid with specified strides
-inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id)
+//! Returns single place if partition contains only one element
+inline exec_place partition_cyclic(const exec_place& e_place, dim4 strides, pos4 tile_id)
 {
-  const auto& g = e_place.as_grid();
-  dim4 g_dims   = e_place.get_dims();
+  dim4 g_dims = e_place.get_dims();
 
   /*
    *  Example : strides = (3, 2). tile 1 id = (1, 0)
@@ -1479,15 +1467,10 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
   // Dimension K_x of the new grid on axis x :
   // pos_x + K_x stride_x = dim_x
   // K_x = (dim_x - pos_x)/stride_x
-  dim4 size = dim4((g.get_dim(0) - tile_id.x + strides.x - 1) / strides.x,
-                   (g.get_dim(1) - tile_id.y + strides.y - 1) / strides.y,
-                   (g.get_dim(2) - tile_id.z + strides.z - 1) / strides.z,
-                   (g.get_dim(3) - tile_id.t + strides.t - 1) / strides.t);
-
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.x, strides.x, tile_id.x);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.y, strides.y, tile_id.y);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.z, strides.z, tile_id.z);
-  //    fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.t, strides.t, tile_id.t);
+  dim4 size = dim4((g_dims.x - tile_id.x + strides.x - 1) / strides.x,
+                   (g_dims.y - tile_id.y + strides.y - 1) / strides.y,
+                   (g_dims.z - tile_id.z + strides.z - 1) / strides.z,
+                   (g_dims.t - tile_id.t + strides.t - 1) / strides.t);
 
   ::std::vector<exec_place> places;
   places.reserve(size.x * size.y * size.z * size.t);
@@ -1500,7 +1483,7 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
       {
         for (size_t x = static_cast<size_t>(tile_id.x); x < g_dims.x; x += strides.x)
         {
-          places.push_back(g.get_place(pos4(x, y, z, t)));
+          places.push_back(e_place.get_place(pos4(x, y, z, t)));
         }
       }
     }
@@ -1514,23 +1497,21 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str
 }
 
 //! Creates a tiled partition of an execution place grid with specified tile sizes
+//! Returns single place if partition contains only one element
 //!
 //! example :
 //! auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1))
-inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id)
+inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos4 tile_id)
 {
-  const auto& g = e_place.as_grid();
+  dim4 g_dims = e_place.get_dims();
 
-  // TODO define dim4=dim4 * dim4
   dim4 begin_coords(
     tile_id.x * tile_sizes.x, tile_id.y * tile_sizes.y, tile_id.z * tile_sizes.z, tile_id.t * tile_sizes.t);
 
-  // TODO define dim4=MIN(dim4,dim4)
-  // upper bound coordinate (excluded)
-  dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g.get_dim(0)),
-                  ::std::min((tile_id.y + 1) * tile_sizes.y, g.get_dim(1)),
-                  ::std::min((tile_id.z + 1) * tile_sizes.z, g.get_dim(2)),
-                  ::std::min((tile_id.t + 1) * tile_sizes.t, g.get_dim(3)));
+  dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g_dims.x),
+                  ::std::min((tile_id.y + 1) * tile_sizes.y, g_dims.y),
+                  ::std::min((tile_id.z + 1) * tile_sizes.z, g_dims.z),
+                  ::std::min((tile_id.t + 1) * tile_sizes.t, g_dims.t));
 
   //    fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.x, tile_sizes.x, tile_id.x);
   //    fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.y, tile_sizes.y, tile_id.y);
@@ -1559,7 +1540,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_
       {
         for (size_t x = static_cast<size_t>(begin_coords.x); x < end_coords.x; x++)
         {
-          places.push_back(g.get_place(pos4(x, y, z, t)));
+          places.push_back(e_place.get_place(pos4(x, y, z, t)));
         }
       }
     }
@@ -1581,7 +1562,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_
 class data_place_composite final : public data_place_interface
 {
 public:
-  data_place_composite(exec_place_grid grid, get_executor_func_t partitioner_func)
+  data_place_composite(exec_place grid, get_executor_func_t partitioner_func)
       : grid_(mv(grid))
       , partitioner_func_(mv(partitioner_func))
   {}
@@ -1641,7 +1622,7 @@ public:
     return false;
   }
 
-  const exec_place_grid& get_grid() const override
+  const exec_place& get_grid() const override
   {
     return grid_;
   }
@@ -1652,7 +1633,7 @@ public:
   }
 
 private:
-  exec_place_grid grid_;
+  exec_place grid_;
   get_executor_func_t partitioner_func_;
 };
 
@@ -1662,14 +1643,14 @@ inline bool data_place::is_composite() const
   return typeid(ref) == typeid(data_place_composite);
 }
 
-inline data_place data_place::composite(get_executor_func_t f, const exec_place_grid& grid)
+inline data_place data_place::composite(get_executor_func_t f, const exec_place& grid)
 {
   return data_place(::std::make_shared<data_place_composite>(grid, f));
 }
 
 // User-visible API when the same partitioner as the one of the grid
 template <typename partitioner_t>
-data_place data_place::composite(partitioner_t, const exec_place_grid& g)
+data_place data_place::composite(partitioner_t, const exec_place& g)
 {
   return data_place::composite(&partitioner_t::get_executor, g);
 }
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index 21ac9da53a6..3d71c5c6993 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -94,8 +94,8 @@ public:
       return;
     }
 
-    exec_place_grid grid = memory_node.get_grid();
-    size_t total_size    = this->shape.size();
+    exec_place grid   = memory_node.get_grid();
+    size_t total_size = this->shape.size();
 
     // position (x,y,z,t) on (nx,ny,nz,nt)
     // * index = x + nx*y + nx*ny*z + nx*ny*nz*t
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
index 4176e74b01d..5c000862613 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -75,24 +75,23 @@ public:
   cudaStream_t get_stream() const
   {
     const auto& e_place = get_exec_place();
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // Even with a grid, when we have a ctx.task construct we have not
       // yet selected/activated a specific place. So we take the main
       // stream associated to the whole task in that case.
-      ::std::ptrdiff_t current_place_id = e_place.as_grid().current_place_id();
-      return (current_place_id < 0 ? dstream.stream : stream_grid[current_place_id].stream);
+      ::std::ptrdiff_t current_id = e_place.current_place_id();
+      return (current_id < 0 ? dstream.stream : stream_grid[current_id].stream);
     }
 
     return dstream.stream;
   }
 
-  // TODO use a pos4 and check that we have a grid, of the proper dimension
   cudaStream_t get_stream(size_t pos) const
   {
     const auto& e_place = get_exec_place();
 
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       return stream_grid[pos].stream;
     }
@@ -116,19 +115,15 @@ public:
     event_list ready_prereqs = acquire(ctx);
 
     /* Select the stream(s) */
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // We have currently no way to pass an array of per-place streams
       _CCCL_ASSERT(automatic_stream, "automatic stream is not enabled");
 
-      // Note: we store grid in a variable to avoid dangling references
-      // because the compiler does not know we are making a reference to
-      // a vector that remains valid
-      const auto& grid   = e_place.as_grid();
-      const auto& places = grid.get_places();
-      for (const exec_place& p : places)
+      // Get stream for each place in the grid
+      for (size_t i = 0; i < e_place.size(); ++i)
       {
-        stream_grid.push_back(p.getStream(true));
+        stream_grid.push_back(e_place.get_place(i).getStream(true));
       }
 
       EXPECT(stream_grid.size() > 0UL);
@@ -187,7 +182,7 @@ public:
     }
 
     // Select one stream to sync with all prereqs
-    auto& s0 = e_place.is_grid() ? stream_grid[0] : dstream;
+    auto& s0 = (e_place.size() > 1) ? stream_grid[0] : dstream;
 
     /* Ensure that stream depend(s) on prereqs */
     submitted_events = stream_async_op(ctx, s0, ready_prereqs);
@@ -196,8 +191,8 @@ public:
       submitted_events.set_symbol("Submitted" + get_symbol());
     }
 
-    /* If this is a grid, all other streams must wait on s0 too */
-    if (e_place.is_grid())
+    /* If this is a multi-place grid, all other streams must wait on s0 too */
+    if (e_place.size() > 1)
     {
       insert_dependencies(stream_grid);
     }
@@ -215,17 +210,17 @@ public:
 
   void set_current_place(pos4 p)
   {
-    get_exec_place().as_grid().set_current_place(p);
+    get_exec_place().set_current_place(p);
   }
 
   void unset_current_place()
   {
-    return get_exec_place().as_grid().unset_current_place();
+    get_exec_place().unset_current_place();
   }
 
-  const exec_place& get_current_place()
+  exec_place get_current_place()
   {
-    return get_exec_place().as_grid().get_current_place();
+    return get_exec_place().get_current_place();
   }
 
   /* End the task, but do not clear its data structures yet */
@@ -236,9 +231,8 @@ public:
     event_list end_list;
 
     const auto& e_place = get_exec_place();
-    // Create an event with this stream
 
-    if (e_place.is_grid())
+    if (e_place.size() > 1)
     {
       // s0 depends on all other streams
       for (size_t i = 1; i < stream_grid.size(); i++)
diff --git a/cudax/test/stf/places/recursion.cu b/cudax/test/stf/places/recursion.cu
index 3af51e4dc98..de86d7c22bf 100644
--- a/cudax/test/stf/places/recursion.cu
+++ b/cudax/test/stf/places/recursion.cu
@@ -12,7 +12,7 @@
 
 using namespace cuda::experimental::stf;
 
-void rec_func(exec_place_grid places)
+void rec_func(exec_place places)
 {
   if (places.size() == 1)
   {