From dacd49e814a17e87c34d6e8d840e7c3224da3e38 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 22:02:59 -0400 Subject: [PATCH 01/12] [WIP] Refactor exec_place to unified grid model All execution places are now modeled as grids: - Scalar places (host, device) are 1-element grids - Multi-device grids remain as before Key changes: - Added get_dims(), get_place(idx) to exec_place::impl - Changed activate/deactivate to take index parameter - Moved set_current_place/unset_current_place to exec_place base - Deprecated is_grid() in favor of size() > 1 - Updated all client code to use new interface This eliminates special-casing for grids vs non-grids and allows uniform iteration over any exec_place. Made-with: Cursor --- .../experimental/__stf/graph/graph_task.cuh | 8 +- .../experimental/__stf/internal/launch.cuh | 8 +- .../__stf/internal/parallel_for_scope.cuh | 13 +- .../__stf/places/exec/cuda_stream.cuh | 39 +- .../__stf/places/exec/green_context.cuh | 70 +- .../__stf/places/place_partition.cuh | 14 +- .../cuda/experimental/__stf/places/places.cuh | 666 +++++++++--------- .../experimental/__stf/stream/stream_task.cuh | 38 +- 8 files changed, 428 insertions(+), 428 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 39375f6db2d..3251b32eded 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -456,17 +456,17 @@ public: void set_current_place(pos4 p) { - get_exec_place().as_grid().set_current_place(p); + get_exec_place().set_current_place(p); } void unset_current_place() { - get_exec_place().as_grid().unset_current_place(); + get_exec_place().unset_current_place(); } - const exec_place& get_current_place() const + exec_place get_current_place() const { - return get_exec_place().as_grid().get_current_place(); + return get_exec_place().get_current_place(); } 
private: diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index b33b14929c7..f20205acd59 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar template void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank) { - assert(!p.is_grid()); + assert(p.size() == 1); p->*[&] { auto th = thread_hierarchy(static_cast(rank), interpreted_policy); @@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg template void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank) { - assert(!p.is_grid()); + assert(p.size() == 1); auto kernel_args = tuple_prepend(thread_hierarchy(static_cast(rank), interpreted_policy), mv(arg)); using args_type = decltype(kernel_args); @@ -331,11 +331,11 @@ public: assert(e_place.affine_data_place() == t.get_affine_data_place()); /* - * If we have a grid of places, the implicit affine partitioner is the blocked_partition. + * If we have a multi-place grid, the implicit affine partitioner is the blocked_partition. * * An explicit composite data place is required per data dependency to customize this behaviour. 
*/ - if (e_place.is_grid()) + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid())); diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index ccc420f6609..90acd49ad22 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -550,8 +550,8 @@ public: // If there is a partitioner, we ensure there is a proper affine data place for this execution place if constexpr (!::std::is_same_v) { - // This is only meaningful for grid of places - if (e_place.is_grid()) + // This is only meaningful for multi-place grids + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid())); @@ -629,7 +629,7 @@ public: if constexpr (need_reduction) { _CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host."); - _CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places."); + _CCCL_ASSERT(e_place.size() == 1, "Reduce access mode currently unimplemented on grid of places."); do_parallel_for_redux(f, e_place, shape, t); return; } @@ -659,7 +659,7 @@ public: if constexpr (!::std::is_same_v && is_extended_host_device_lambda_closure_type || is_extended_device_lambda_closure_type) { - if (!e_place.is_grid()) + if (e_place.size() == 1) { // Apply the parallel_for construct over the entire shape on the // execution place of the task @@ -681,11 +681,12 @@ public: } else { - size_t grid_size = t.grid_dims().size(); + const auto& t_place = t.get_exec_place(); + size_t grid_size = t_place.size(); for (size_t i = 0; i < grid_size; i++) { 
t.set_current_place(pos4(i)); - const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims()); + const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims()); do_parallel_for(f, t.get_current_place(), sub_shape, t); t.unset_current_place(); } diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh index a18545e4014..5cf256cb9ea 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh @@ -41,29 +41,37 @@ public: public: impl(const decorated_stream& _dstream) : exec_place::impl(data_place::device(_dstream.dev_id)) - , dstream(_dstream) - , dummy_pool(_dstream) + , dstream_(_dstream) + , dummy_pool_(_dstream) {} - /* We set the current device to be the device on which the CUDA stream was created */ - exec_place activate() const override + // Grid interface - cuda_stream is a 1-element grid + exec_place get_place(size_t idx) const override { - return exec_place::device(dstream.dev_id).activate(); + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::cuda_stream(dstream_); } - void deactivate(const exec_place& prev) const override + exec_place activate(size_t idx) const override { - return exec_place::device(dstream.dev_id).deactivate(prev); + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::device(dstream_.dev_id).activate(); + } + + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + exec_place::device(dstream_.dev_id).deactivate(prev); } stream_pool& get_stream_pool(bool) const override { - return dummy_pool; + return dummy_pool_; } ::std::string to_string() const override { - return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")"; + 
return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")"; } bool operator==(const exec_place::impl& rhs) const override @@ -73,14 +81,12 @@ public: return false; } const auto& other = static_cast(rhs); - // Compare by stream handle - return dstream.stream == other.dstream.stream; + return dstream_.stream == other.dstream_.stream; } size_t hash() const override { - // Hash the stream handle, not the affine data place - return ::std::hash()(dstream.stream); + return ::std::hash()(dstream_.stream); } bool operator<(const exec_place::impl& rhs) const override @@ -90,13 +96,12 @@ public: return typeid(*this).before(typeid(rhs)); } const auto& other = static_cast(rhs); - return dstream.stream < other.dstream.stream; + return dstream_.stream < other.dstream_.stream; } private: - decorated_stream dstream; - // We create a dummy pool of streams which only consists in a single stream in practice. - mutable stream_pool dummy_pool; + decorated_stream dstream_; + mutable stream_pool dummy_pool_; }; public: diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index f5e5531eb52..7a7c6b62c8d 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -270,53 +270,50 @@ public: impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false) : exec_place::impl( use_green_ctx_data_place ? 
make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid)) - , devid(gc_view.devid) - , g_ctx(gc_view.g_ctx) - , pool(mv(gc_view.pool)) + , devid_(gc_view.devid) + , g_ctx_(gc_view.g_ctx) + , pool_(mv(gc_view.pool)) {} // This is used to implement deactivate and wrap an existing context impl(CUcontext saved_context) - : driver_context(saved_context) + : driver_context_(saved_context) {} - exec_place activate() const override + // Grid interface - green_ctx is a 1-element grid + exec_place get_place(size_t idx) const override { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_)); + } + + exec_place activate(size_t idx) const override + { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + // Save the current context and transform it into a fake green context place CUcontext current_ctx; cuda_safe_call(cuCtxGetCurrent(&current_ctx)); exec_place result = exec_place(::std::make_shared(current_ctx)); - // Convert the green context to a primary context (TODO cache this ?) 
- cuda_safe_call(cuCtxFromGreenCtx(&driver_context, g_ctx)); - -# if 0 - // for debug purposes, display the affinity - { - CUdevResource check_resource; - cuda_safe_call(cuGreenCtxGetDevResource(g_ctx, &check_resource, CU_DEV_RESOURCE_TYPE_SM)); - unsigned long long check_ctxId; - cuda_safe_call(cuCtxGetId(driver_context, &check_ctxId)); - fprintf(stderr, "ACTIVATE : set affinity with %d SMs (ctx ID = %llu)\n", check_resource.sm.smCount, - check_ctxId); - } -# endif - - cuda_safe_call(cuCtxSetCurrent(driver_context)); + // Convert the green context to a primary context + cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_)); + cuda_safe_call(cuCtxSetCurrent(driver_context_)); return result; } - void deactivate(const exec_place& prev) const override + void deactivate(size_t idx, const exec_place& prev) const override { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); - CUcontext saved_ctx = prev_impl->driver_context; + CUcontext saved_ctx = prev_impl->driver_context_; # ifdef DEBUG - // Ensure that the current context is the green context that we have activated before CUcontext current_ctx; cuda_safe_call(cuCtxGetCurrent(&current_ctx)); - assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context)); + assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_)); # endif cuda_safe_call(cuCtxSetCurrent(saved_ctx)); @@ -324,13 +321,12 @@ public: ::std::string to_string() const override { - return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid) - + ")"; + return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")"; } stream_pool& get_stream_pool(bool) const override { - return pool; + return pool_; } bool operator==(const exec_place::impl& rhs) const override @@ -340,14 +336,12 @@ public: return false; } const auto& other = static_cast(rhs); 
- // Compare green context handles - return g_ctx == other.g_ctx; + return g_ctx_ == other.g_ctx_; } size_t hash() const override { - // Hash the green context handle, not the affine data place - return ::std::hash()(g_ctx); + return ::std::hash()(g_ctx_); } bool operator<(const exec_place::impl& rhs) const override @@ -357,16 +351,14 @@ public: return typeid(*this).before(typeid(rhs)); } const auto& other = static_cast(rhs); - return g_ctx < other.g_ctx; + return g_ctx_ < other.g_ctx_; } private: - int devid = -1; - CUgreenCtx g_ctx = {}; - // a context created from the green context (or used to store an existing context to implement - // activate/deactivate) - mutable CUcontext driver_context = {}; - mutable stream_pool pool; + int devid_ = -1; + CUgreenCtx g_ctx_ = {}; + mutable CUcontext driver_context_ = {}; + mutable stream_pool pool_; }; public: diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh index d7cd70dca54..e0d2afed705 100644 --- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -222,7 +222,7 @@ private: /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */ void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope) { - if (place.is_grid() && scope == place_partition_scope::cuda_stream) + if (place.size() > 1 && scope == place_partition_scope::cuda_stream) { // Recursively partition grid into devices, then into streams for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device)) @@ -247,7 +247,7 @@ private: // Green contexts are only supported since CUDA 12.4 #if _CCCL_CTK_AT_LEAST(12, 4) - if (place.is_grid() && scope == place_partition_scope::green_context) + if (place.size() > 1 && scope == place_partition_scope::green_context) { // 
Recursively partition grid into devices, then into green contexts for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device)) @@ -291,11 +291,13 @@ private: #endif // _CCCL_CTK_BELOW(12, 4) _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle."); - if (place.is_grid() && scope == place_partition_scope::cuda_device) + if (place.size() > 1 && scope == place_partition_scope::cuda_device) { - exec_place_grid g = place.as_grid(); - // Copy the vector of places - sub_places = g.get_places(); + // Get places from the grid + for (size_t i = 0; i < place.size(); ++i) + { + sub_places.push_back(place.get_place(i)); + } return; } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 8b8a75c23cc..f1708188986 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -377,18 +377,19 @@ inline data_place from_index(size_t n); /** * @brief Indicates where a computation takes place (CPU, dev0, dev1, ...) * - * Currently data and computation are together `(devid == int(data_place))`. + * All execution places are modeled as grids. Scalar places (host, single device) + * are simply 1-element grids. This unified model eliminates special-casing and + * allows uniform iteration over any exec_place. */ class exec_place { public: /* - * @brief Using the pimpl idiom. Public because a number of classes inehrit from this. + * @brief Using the pimpl idiom. Public because a number of classes inherit from this. 
*/ class impl { public: - // Note that the default ctor assumes an invalid affine data place impl() = default; impl(const impl&) = delete; impl& operator=(const impl&) = delete; @@ -398,8 +399,44 @@ public: : affine(mv(place)) {} - virtual exec_place activate() const + // ===== Grid interface (all places are grids) ===== + + /** + * @brief Get the dimensions of this grid + * + * For scalar places, returns dim4(1, 1, 1, 1). + */ + virtual dim4 get_dims() const + { + return dim4(1, 1, 1, 1); + } + + /** + * @brief Get the total number of places in this grid + */ + virtual size_t size() const + { + return 1; + } + + /** + * @brief Get the sub-place at the given linear index + * + * For scalar places, idx must be 0. + */ + virtual exec_place get_place(size_t idx) const; + + // ===== Activation/deactivation (indexed) ===== + + /** + * @brief Activate the sub-place at the given index + * + * For scalar places, idx must be 0. + * Returns the previous execution state needed for deactivate(). + */ + virtual exec_place activate(size_t idx) const { + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); if (!affine.is_device()) { return exec_place(); @@ -414,8 +451,12 @@ public: return exec_place(mv(old_dev)); } - virtual void deactivate(const exec_place& prev) const + /** + * @brief Deactivate the sub-place at the given index, restoring previous state + */ + virtual void deactivate(size_t idx, const exec_place& prev) const { + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); if (affine.is_device()) { auto current_dev_id = cuda_try(); @@ -427,6 +468,8 @@ public: } } + // ===== Properties ===== + virtual const data_place affine_data_place() const { return affine; @@ -447,14 +490,13 @@ public: return affine.is_device(); } + /** + * @brief Check if this is a multi-element grid (size > 1) + * @deprecated Use size() > 1 instead + */ virtual bool is_grid() const { - return false; - } - - virtual size_t size() const - { - return 1; + return size() > 1; } 
virtual void set_affine_data_place(data_place place) @@ -462,6 +504,8 @@ public: affine = mv(place); } + // ===== Comparison ===== + virtual bool operator==(const impl& rhs) const { return affine == rhs.affine; @@ -474,22 +518,15 @@ public: virtual bool operator<(const impl& rhs) const { - // Different types: order by typeid if (typeid(*this) != typeid(rhs)) { return typeid(*this).before(typeid(rhs)); } - // Same type (both base impl): compare by device ID - // (base impl stores devid in affine, so we extract it via device_ordinal) return device_ordinal(affine) < device_ordinal(rhs.affine); } - /** - * @brief Get the stream pool for this execution place. - * - * The base implementation returns pool_compute or pool_data stored - * directly on the impl. - */ + // ===== Stream management ===== + virtual stream_pool& get_stream_pool(bool for_computation) const { return for_computation ? pool_compute : pool_data; @@ -503,6 +540,10 @@ public: data_place affine = data_place::invalid(); mutable stream_pool pool_compute; mutable stream_pool pool_data; + + // Current place state for grid iteration + mutable ::std::ptrdiff_t current_idx = -1; + mutable ::std::shared_ptr saved_prev_impl; }; exec_place() = default; @@ -522,7 +563,6 @@ public: return !(*this == rhs); } - // To use in a ::std::map indexed by exec_place bool operator<(const exec_place& rhs) const { return *pimpl < *rhs.pimpl; @@ -543,20 +583,51 @@ public: return !(*this < rhs); } + size_t hash() const + { + return pimpl->hash(); + } + + // ===== Grid interface (all places are grids) ===== + /** - * @brief Compute a hash value for this execution place + * @brief Get the dimensions of this grid * - * Used by std::hash specialization for unordered containers. + * For scalar places (host, single device), returns dim4(1, 1, 1, 1). */ - size_t hash() const + dim4 get_dims() const { - return pimpl->hash(); + return pimpl->get_dims(); } /** - * @brief an iterator class which goes over all subplaces in an exec place. 
+ * @brief Get the total number of places in this grid + */ + size_t size() const + { + return pimpl->size(); + } + + /** + * @brief Get the sub-place at the given linear index * - * This is a trivial singleton unless we have a grid of places. + * For scalar places, idx must be 0 and returns the place itself. + */ + exec_place get_place(size_t idx) const + { + return pimpl->get_place(idx); + } + + /** + * @brief Get the sub-place at the given multi-dimensional position + */ + exec_place get_place(pos4 p) const + { + return get_place(get_dims().get_index(p)); + } + + /** + * @brief an iterator class which goes over all subplaces in an exec place. */ class iterator { @@ -566,7 +637,10 @@ public: , index(index) {} - exec_place operator*(); + exec_place operator*() + { + return it_impl->get_place(index); + } iterator& operator++() { @@ -598,68 +672,123 @@ public: return iterator(pimpl, pimpl->size()); } + // ===== Activation/deactivation ===== + /** - * @brief Returns a string representation of the execution place object. + * @brief Activate the sub-place at the given index * - * @return std::string + * @param idx The index of the sub-place to activate (default 0 for scalar places) + * @return The previous execution state needed for deactivate() */ - ::std::string to_string() const + exec_place activate(size_t idx = 0) const { - return pimpl->to_string(); + return pimpl->activate(idx); } /** - * @brief Returns the `data_place` naturally associated with this execution place. 
+ * @brief Deactivate the sub-place at the given index, restoring previous state + * + * @param idx The index of the sub-place to deactivate (default 0 for scalar places) + * @param prev The previous state returned by activate() */ - const data_place affine_data_place() const + void deactivate(size_t idx, const exec_place& prev) const { - return pimpl->affine_data_place(); + pimpl->deactivate(idx, prev); } - void set_affine_data_place(data_place place) + /** + * @brief Convenience overload for scalar places (idx=0) + */ + void deactivate(const exec_place& prev) const { - pimpl->set_affine_data_place(mv(place)); + deactivate(0, prev); } - stream_pool& get_stream_pool(bool for_computation) const + /** + * @brief Set the current place for grid iteration + * + * Activates the place at the given index and saves state for later restoration. + */ + void set_current_place(size_t idx) { - return pimpl->get_stream_pool(for_computation); + if (pimpl->current_idx >= 0) + { + exec_place saved_prev(pimpl->saved_prev_impl); + pimpl->deactivate(pimpl->current_idx, saved_prev); + } + pimpl->current_idx = static_cast<::std::ptrdiff_t>(idx); + exec_place prev = pimpl->activate(idx); + pimpl->saved_prev_impl = prev.pimpl; } /** - * @brief Get a decorated stream from the stream pool associated to this execution place. + * @brief Set the current place using multi-dimensional position */ - decorated_stream getStream(bool for_computation) const; - - cudaStream_t pick_stream(bool for_computation = true) const + void set_current_place(pos4 p) { - return getStream(for_computation).stream; + set_current_place(get_dims().get_index(p)); } - // TODO make protected ! 
- const ::std::shared_ptr& get_impl() const + /** + * @brief Unset the current place, restoring previous execution context + */ + void unset_current_place() { - return pimpl; + EXPECT(pimpl->current_idx >= 0, "unset_current_place() called without corresponding set_current_place()"); + exec_place saved_prev(pimpl->saved_prev_impl); + pimpl->deactivate(pimpl->current_idx, saved_prev); + pimpl->current_idx = -1; } /** - * @brief Set computation to run on this place. - * - * @return `exec_place` The previous execution place. See `deactivate` below. + * @brief Get the currently active sub-place */ - exec_place activate() const + exec_place get_current_place() const { - return pimpl->activate(); + EXPECT(pimpl->current_idx >= 0, "No current place set"); + return get_place(pimpl->current_idx); } /** - * @brief Undoes the effect of `activate`. Call with the previous `exec_place` object returned by `activate`. - * - * @warning Undefined behavior if you don't pass the result of `activate`. + * @brief Get the index of the currently active sub-place, or -1 if none */ - void deactivate(const exec_place& p) const + ::std::ptrdiff_t current_place_id() const { - pimpl->deactivate(p); + return pimpl->current_idx; + } + + // ===== Properties ===== + + ::std::string to_string() const + { + return pimpl->to_string(); + } + + const data_place affine_data_place() const + { + return pimpl->affine_data_place(); + } + + void set_affine_data_place(data_place place) + { + pimpl->set_affine_data_place(mv(place)); + } + + stream_pool& get_stream_pool(bool for_computation) const + { + return pimpl->get_stream_pool(for_computation); + } + + decorated_stream getStream(bool for_computation) const; + + cudaStream_t pick_stream(bool for_computation = true) const + { + return getStream(for_computation).stream; + } + + const ::std::shared_ptr& get_impl() const + { + return pimpl; } bool is_host() const @@ -672,24 +801,38 @@ public: return pimpl->is_device(); } + /** + * @brief Check if this is a 
multi-element grid (size > 1) + * @deprecated Use size() > 1 instead. All places are now grids. + */ bool is_grid() const { return pimpl->is_grid(); } - size_t size() const + /** + * @brief Get the dimension along a specific axis + * @deprecated Use get_dims().get(axis_id) instead + */ + size_t grid_dim(int axis_id) const { - return pimpl->size(); + return get_dims().get(axis_id); } - // Get the implementation assuming this is a grid - // We need to defer the implementation after exec_place_grid has been - // defined because this requires a ::std::static_pointer_cast from the base - // class to exec_place_grid - exec_place_grid as_grid() const; + /** + * @brief Get all dimensions + * @deprecated Use get_dims() instead + */ + dim4 grid_dims() const + { + return get_dims(); + } - size_t grid_dim(int axid_is) const; - dim4 grid_dims() const; + /** + * @brief Convert to exec_place_grid type + * @deprecated All places are grids now; use exec_place methods directly + */ + exec_place_grid as_grid() const; /* These helper methods provide convenient way to express execution places, * for example exec_place::host or exec_place::device(4). @@ -870,11 +1013,11 @@ inline decorated_stream exec_place::getStream(bool for_computation) const /** * @brief Designates execution that is to run on the host. * + * Host is modeled as a 1-element grid containing the host execution context. */ class exec_place_host : public exec_place { public: - // Implementation of the exec_place_host class class impl : public exec_place::impl { public: @@ -882,21 +1025,27 @@ public: : exec_place::impl(data_place::host()) {} - // operator<: base class implementation is correct (compares typeid, then device_ordinal). - // Since host is a singleton, all instances compare equal. 
+ // Grid interface - host is a 1-element grid + exec_place get_place(size_t idx) const override; - exec_place activate() const override + // Activation - no-op for host + exec_place activate(size_t idx) const override { + EXPECT(idx == 0, "Index out of bounds for host exec_place"); return exec_place(); - } // no-op - void deactivate(const exec_place& p) const override + } + + void deactivate(size_t idx, const exec_place& prev) const override { - _CCCL_ASSERT(!p.get_impl(), ""); - } // no-op - virtual const data_place affine_data_place() const override + EXPECT(idx == 0, "Index out of bounds for host exec_place"); + _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev"); + } + + const data_place affine_data_place() const override { return data_place::host(); } + stream_pool& get_stream_pool(bool for_computation) const override { return exec_place::current_device().get_stream_pool(for_computation); @@ -943,6 +1092,8 @@ UNITTEST("exec_place_host::operator->*") /** * @brief Designates execution that is to run on a specific CUDA device. + * + * Device is modeled as a 1-element grid containing that device. 
*/ class exec_place_device : public exec_place { @@ -952,10 +1103,22 @@ public: public: explicit impl(int devid) : exec_place::impl(data_place::device(devid)) + , devid_(devid) { pool_compute = stream_pool(pool_size); pool_data = stream_pool(data_pool_size); } + + // Grid interface - device is a 1-element grid + exec_place get_place(size_t idx) const override; + + int get_devid() const + { + return devid_; + } + + private: + int devid_; }; }; @@ -1032,83 +1195,87 @@ public: class impl : public exec_place::impl { public: - // Define a grid directly from a vector of places - // This creates an execution grid automatically impl(::std::vector _places) - : dims(_places.size(), 1, 1, 1) - , places(mv(_places)) + : dims_(_places.size(), 1, 1, 1) + , places_(mv(_places)) { - _CCCL_ASSERT(!places.empty(), ""); - _CCCL_ASSERT(dims.x > 0, ""); - _CCCL_ASSERT(affine.is_invalid(), ""); + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); } - // With a "dim4 shape" impl(::std::vector _places, const dim4& _dims) - : dims(_dims) - , places(mv(_places)) + : dims_(_dims) + , places_(mv(_places)) + { + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); + } + + // ===== Grid interface ===== + + dim4 get_dims() const override { - _CCCL_ASSERT(dims.x > 0, ""); - _CCCL_ASSERT(affine.is_invalid(), ""); + return dims_; } - // TODO improve with a better description - ::std::string to_string() const final + size_t size() const override { - return ::std::string("GRID place"); + return dims_.size(); } - exec_place activate() const override + exec_place get_place(size_t idx) const override { - // No-op - return exec_place(); + EXPECT(idx < places_.size(), "Index out of bounds"); + return places_[idx]; + } + + // ===== Activation (delegates to sub-places) ===== + + exec_place activate(size_t idx) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + return 
places_[idx].activate(0); } - // TODO : shall we deactivate the current place, if any ? - void deactivate(const exec_place& _prev) const override + void deactivate(size_t idx, const exec_place& prev) const override { - // No-op - EXPECT(!_prev.get_impl(), "Invalid execution place."); + EXPECT(idx < places_.size(), "Index out of bounds"); + places_[idx].deactivate(0, prev); + } + + // ===== Properties ===== + + ::std::string to_string() const override + { + return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + + "x" + ::std::to_string(dims_.t) + ")"; } - /* Dynamically checks whether an execution place is a device */ bool is_device() const override { return false; } - /* Dynamically checks whether an execution place is a grid */ - bool is_grid() const override + bool is_host() const override { - return true; + return false; } + // ===== Comparison ===== + bool operator==(const exec_place::impl& rhs) const override { - // First, check if rhs is of type exec_place_grid::impl auto other = dynamic_cast(&rhs); if (!other) { - return false; // rhs is not a grid, so they are not equal + return false; } - - // Compare two grids - return *this == *other; - } - - // Compare two grids - bool operator==(const impl& rhs) const - { - // Compare grid-specific properties - // Note: for grids, equality is determined by dims and places, not the affine data place - return dims == rhs.dims && places == rhs.places; + return dims_ == other->dims_ && places_ == other->places_; } size_t hash() const override { - // Hash based on dims and places, consistent with operator== - size_t h = ::cuda::experimental::stf::hash{}(dims); - for (const auto& p : places) + size_t h = ::cuda::experimental::stf::hash{}(dims_); + for (const auto& p : places_) { hash_combine(h, p.hash()); } @@ -1117,211 +1284,61 @@ public: bool operator<(const exec_place::impl& rhs) const override { - // Different types: order by typeid if (typeid(*this) != 
typeid(rhs)) { return typeid(*this).before(typeid(rhs)); } - // Same type: safe to cast const auto& other = static_cast(rhs); - // Compare dims first, then places - if (!(dims == other.dims)) + if (!(dims_ == other.dims_)) { - // Use tuple comparison for consistent ordering - return ::std::tie(dims.x, dims.y, dims.z, dims.t) - < ::std::tie(other.dims.x, other.dims.y, other.dims.z, other.dims.t); + return ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t) + < ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); } - return places < other.places; + return places_ < other.places_; } - const ::std::vector& get_places() const - { - return places; - } + // ===== Stream management ===== stream_pool& get_stream_pool(bool for_computation) const override { _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool"); - const auto& v = get_places(); - _CCCL_ASSERT(v.size() > 0, "Grid must have at least one place"); - return v[0].get_stream_pool(for_computation); - } - - exec_place grid_activate(size_t i) const - { - const auto& v = get_places(); - return v[i].activate(); - } - - void grid_deactivate(size_t i, exec_place p) const - { - const auto& v = get_places(); - v[i].deactivate(p); - } - - const exec_place& get_current_place() - { - return get_places()[current_p_1d]; - } - - // Set the current place from the 1D index within the grid (flattened grid) - void set_current_place(size_t p_index) - { - // Unset the previous place, if any - if (current_p_1d >= 0) - { - // First deactivate the previous place - grid_deactivate(current_p_1d, old_place); - } - - // get the 1D index for that position - current_p_1d = (::std::ptrdiff_t) p_index; - - // The returned value contains the state to restore when we deactivate the place - old_place = grid_activate(current_p_1d); - } - - // Set the current place, given the position in the grid - void set_current_place(pos4 p) - { - size_t p_index = dims.get_index(p); - set_current_place(p_index); - } - - void 
unset_current_place() - { - EXPECT(current_p_1d >= 0, "unset_current_place() called without corresponding call to set_current_place()"); - - // First deactivate the previous place - grid_deactivate(current_p_1d, old_place); - current_p_1d = -1; - } - - ::std::ptrdiff_t current_place_id() const - { - return current_p_1d; - } - - dim4 get_dims() const - { - return dims; - } - - size_t get_dim(int axis_id) const - { - return dims.get(axis_id); + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + return places_[0].get_stream_pool(for_computation); } - size_t size() const override - { - return dims.size(); - } + // ===== Grid-specific accessors ===== - /* Get the place associated to this position in the grid */ - const exec_place& get_place(pos4 p) const - { - return coords_to_place(p); - } - - const exec_place& get_place(size_t p_index) const + const ::std::vector& get_places() const { - return coords_to_place(p_index); + return places_; } private: - // What is the execution place at theses coordinates in the exec place grid ? - const exec_place& coords_to_place(size_t c0, size_t c1 = 0, size_t c2 = 0, size_t c3 = 0) const - { - // Flatten the (c0, c1, c2, c3) vector into a global index - size_t index = c0 + dims.get(0) * (c1 + dims.get(1) * (c2 + c3 * dims.get(2))); - return places[index]; - } - - const exec_place& coords_to_place(pos4 coords) const - { - return coords_to_place(coords.x, coords.y, coords.z, coords.t); - } - - // current position in the grid (flattened to 1D) if we have a grid of - // execution place. -1 indicates there is no current position. 
- ::std::ptrdiff_t current_p_1d = -1; - - // saved state before setting the current place - exec_place old_place; - - // dimensions of the "grid" - dim4 dims; - ::std::vector places; + dim4 dims_; + ::std::vector places_; }; - ///@{ @name Constructors - dim4 get_dims() const - { - return get_impl()->get_dims(); - } - - size_t get_dim(int axis_id) const - { - return get_dims().get(axis_id); - } - - size_t size() const - { - return get_dims().size(); - } - explicit operator bool() const { - return get_impl() != nullptr; + return exec_place::get_impl() != nullptr; } - /* Note that we compare against the exact same implementation : we could - * have equivalent grids with the same execution places, but to avoid a - * costly comparison we here only look for actually identical grids. - */ bool operator==(const exec_place_grid& rhs) const { return *get_impl() == *(rhs.get_impl()); } - ::std::ptrdiff_t current_place_id() const - { - return get_impl()->current_place_id(); - } - - const exec_place& get_place(pos4 p) const - { - return get_impl()->get_place(p); - } - + /** + * @brief Get the vector of sub-places (grid-specific) + */ const ::std::vector& get_places() const { return get_impl()->get_places(); } - // Set the current place from the 1D index within the grid (flattened grid) - void set_current_place(size_t p_index) - { - return get_impl()->set_current_place(p_index); - } - - // Get the current execution place - const exec_place& get_current_place() - { - return get_impl()->get_current_place(); - } - - // Set the current place, given the position in the grid - void set_current_place(pos4 p) - { - return get_impl()->set_current_place(p); - } - - void unset_current_place() - { - return get_impl()->unset_current_place(); - } - + /** + * @brief Get the typed impl (for grid-specific operations) + */ ::std::shared_ptr get_impl() const { _CCCL_ASSERT(::std::dynamic_pointer_cast(exec_place::get_impl()), "Invalid exec_place_grid impl"); @@ -1333,7 +1350,6 @@ public: : 
exec_place(nullptr) {} - // private: exec_place_grid(::std::shared_ptr p) : exec_place(mv(p)) {} @@ -1398,41 +1414,40 @@ inline exec_place data_place::affine_exec_place() const + ::std::to_string(pimpl_->get_device_ordinal())); } -/// Implementation deferred because we need the definition of exec_place_grid -inline exec_place exec_place::iterator::operator*() +// === Deferred implementations for get_place() === + +inline exec_place exec_place::impl::get_place(size_t idx) const { - EXPECT(index < it_impl->size()); - if (it_impl->is_grid()) - { - return ::std::static_pointer_cast(it_impl)->get_place(index); - } - return exec_place(it_impl); + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); + // For generic scalar places, we can't easily return self + // This should be overridden by concrete implementations + return exec_place( + ::std::const_pointer_cast(::std::shared_ptr(::std::shared_ptr{}, this))); } -//! Creates a grid by replicating an execution place multiple times -inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt) +inline exec_place exec_place_host::impl::get_place(size_t idx) const { - return make_grid(::std::vector(cnt, e)); + EXPECT(idx == 0, "Index out of bounds for host exec_place"); + return exec_place::host(); } -/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ -inline exec_place_grid exec_place::as_grid() const +inline exec_place exec_place_device::impl::get_place(size_t idx) const { - // Make sure it is really a grid - EXPECT(is_grid()); - return exec_place_grid(::std::static_pointer_cast(pimpl)); + EXPECT(idx == 0, "Index out of bounds for device exec_place"); + return exec_place::device(devid_); } -inline dim4 exec_place::grid_dims() const +//! 
Creates a grid by replicating an execution place multiple times +inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt) { - EXPECT(is_grid()); - return ::std::static_pointer_cast(pimpl)->get_dims(); + return make_grid(::std::vector(cnt, e)); } -inline size_t exec_place::grid_dim(int axis_id) const +/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ +inline exec_place_grid exec_place::as_grid() const { - EXPECT(is_grid()); - return ::std::static_pointer_cast(pimpl)->get_dim(axis_id); + EXPECT(size() > 1, "as_grid() called on scalar exec_place"); + return exec_place_grid(::std::static_pointer_cast(pimpl)); } /* Get the first N available devices */ @@ -1466,8 +1481,7 @@ inline exec_place_grid exec_place::all_devices() //! Creates a cyclic partition of an execution place grid with specified strides inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id) { - const auto& g = e_place.as_grid(); - dim4 g_dims = e_place.get_dims(); + dim4 g_dims = e_place.get_dims(); /* * Example : strides = (3, 2). 
tile 1 id = (1, 0) @@ -1479,15 +1493,10 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str // Dimension K_x of the new grid on axis x : // pos_x + K_x stride_x = dim_x // K_x = (dim_x - pos_x)/stride_x - dim4 size = dim4((g.get_dim(0) - tile_id.x + strides.x - 1) / strides.x, - (g.get_dim(1) - tile_id.y + strides.y - 1) / strides.y, - (g.get_dim(2) - tile_id.z + strides.z - 1) / strides.z, - (g.get_dim(3) - tile_id.t + strides.t - 1) / strides.t); - - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.x, strides.x, tile_id.x); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.y, strides.y, tile_id.y); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.z, strides.z, tile_id.z); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.t, strides.t, tile_id.t); + dim4 size = dim4((g_dims.x - tile_id.x + strides.x - 1) / strides.x, + (g_dims.y - tile_id.y + strides.y - 1) / strides.y, + (g_dims.z - tile_id.z + strides.z - 1) / strides.z, + (g_dims.t - tile_id.t + strides.t - 1) / strides.t); ::std::vector places; places.reserve(size.x * size.y * size.z * size.t); @@ -1500,7 +1509,7 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str { for (size_t x = static_cast(tile_id.x); x < g_dims.x; x += strides.x) { - places.push_back(g.get_place(pos4(x, y, z, t))); + places.push_back(e_place.get_place(pos4(x, y, z, t))); } } } @@ -1519,18 +1528,15 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str //! 
auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1)) inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id) { - const auto& g = e_place.as_grid(); + dim4 g_dims = e_place.get_dims(); - // TODO define dim4=dim4 * dim4 dim4 begin_coords( tile_id.x * tile_sizes.x, tile_id.y * tile_sizes.y, tile_id.z * tile_sizes.z, tile_id.t * tile_sizes.t); - // TODO define dim4=MIN(dim4,dim4) - // upper bound coordinate (excluded) - dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g.get_dim(0)), - ::std::min((tile_id.y + 1) * tile_sizes.y, g.get_dim(1)), - ::std::min((tile_id.z + 1) * tile_sizes.z, g.get_dim(2)), - ::std::min((tile_id.t + 1) * tile_sizes.t, g.get_dim(3))); + dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g_dims.x), + ::std::min((tile_id.y + 1) * tile_sizes.y, g_dims.y), + ::std::min((tile_id.z + 1) * tile_sizes.z, g_dims.z), + ::std::min((tile_id.t + 1) * tile_sizes.t, g_dims.t)); // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.x, tile_sizes.x, tile_id.x); // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.y, tile_sizes.y, tile_id.y); @@ -1559,7 +1565,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_ { for (size_t x = static_cast(begin_coords.x); x < end_coords.x; x++) { - places.push_back(g.get_place(pos4(x, y, z, t))); + places.push_back(e_place.get_place(pos4(x, y, z, t))); } } } diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 4176e74b01d..5c000862613 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -75,24 +75,23 @@ public: cudaStream_t get_stream() const { const auto& e_place = get_exec_place(); - if (e_place.is_grid()) + if (e_place.size() > 1) { // Even with a grid, when we have a ctx.task construct we have not // yet selected/activated a specific 
place. So we take the main // stream associated to the whole task in that case. - ::std::ptrdiff_t current_place_id = e_place.as_grid().current_place_id(); - return (current_place_id < 0 ? dstream.stream : stream_grid[current_place_id].stream); + ::std::ptrdiff_t current_id = e_place.current_place_id(); + return (current_id < 0 ? dstream.stream : stream_grid[current_id].stream); } return dstream.stream; } - // TODO use a pos4 and check that we have a grid, of the proper dimension cudaStream_t get_stream(size_t pos) const { const auto& e_place = get_exec_place(); - if (e_place.is_grid()) + if (e_place.size() > 1) { return stream_grid[pos].stream; } @@ -116,19 +115,15 @@ public: event_list ready_prereqs = acquire(ctx); /* Select the stream(s) */ - if (e_place.is_grid()) + if (e_place.size() > 1) { // We have currently no way to pass an array of per-place streams _CCCL_ASSERT(automatic_stream, "automatic stream is not enabled"); - // Note: we store grid in a variable to avoid dangling references - // because the compiler does not know we are making a reference to - // a vector that remains valid - const auto& grid = e_place.as_grid(); - const auto& places = grid.get_places(); - for (const exec_place& p : places) + // Get stream for each place in the grid + for (size_t i = 0; i < e_place.size(); ++i) { - stream_grid.push_back(p.getStream(true)); + stream_grid.push_back(e_place.get_place(i).getStream(true)); } EXPECT(stream_grid.size() > 0UL); @@ -187,7 +182,7 @@ public: } // Select one stream to sync with all prereqs - auto& s0 = e_place.is_grid() ? stream_grid[0] : dstream; + auto& s0 = (e_place.size() > 1) ? 
stream_grid[0] : dstream; /* Ensure that stream depend(s) on prereqs */ submitted_events = stream_async_op(ctx, s0, ready_prereqs); @@ -196,8 +191,8 @@ public: submitted_events.set_symbol("Submitted" + get_symbol()); } - /* If this is a grid, all other streams must wait on s0 too */ - if (e_place.is_grid()) + /* If this is a multi-place grid, all other streams must wait on s0 too */ + if (e_place.size() > 1) { insert_dependencies(stream_grid); } @@ -215,17 +210,17 @@ public: void set_current_place(pos4 p) { - get_exec_place().as_grid().set_current_place(p); + get_exec_place().set_current_place(p); } void unset_current_place() { - return get_exec_place().as_grid().unset_current_place(); + get_exec_place().unset_current_place(); } - const exec_place& get_current_place() + exec_place get_current_place() { - return get_exec_place().as_grid().get_current_place(); + return get_exec_place().get_current_place(); } /* End the task, but do not clear its data structures yet */ @@ -236,9 +231,8 @@ public: event_list end_list; const auto& e_place = get_exec_place(); - // Create an event with this stream - if (e_place.is_grid()) + if (e_place.size() > 1) { // s0 depends on all other streams for (size_t i = 1; i < stream_grid.size(); i++) From 6e2df83648b97d4fbd33799379b98ba59ce8667f Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 22:22:57 -0400 Subject: [PATCH 02/12] Remove exec_place_cuda_stream shell class The shell class added no value - just use the dynamic interface directly and return exec_place from the factory methods. 
Made-with: Cursor --- .../__stf/places/exec/cuda_stream.cuh | 125 ++++++++---------- .../cuda/experimental/__stf/places/places.cuh | 5 +- 2 files changed, 58 insertions(+), 72 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh index 5cf256cb9ea..c4935000adc 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh @@ -30,97 +30,84 @@ namespace cuda::experimental::stf { /** - * @brief Designates execution that is to run on a specific CUDA stream - * + * @brief Implementation for CUDA stream execution places */ -class exec_place_cuda_stream : public exec_place +class exec_place_cuda_stream_impl : public exec_place::impl { public: - class impl : public exec_place::impl + exec_place_cuda_stream_impl(const decorated_stream& dstream) + : exec_place::impl(data_place::device(dstream.dev_id)) + , dstream_(dstream) + , dummy_pool_(dstream) + {} + + exec_place get_place(size_t idx) const override { - public: - impl(const decorated_stream& _dstream) - : exec_place::impl(data_place::device(_dstream.dev_id)) - , dstream_(_dstream) - , dummy_pool_(_dstream) - {} - - // Grid interface - cuda_stream is a 1-element grid - exec_place get_place(size_t idx) const override - { - EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); - return exec_place::cuda_stream(dstream_); - } + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::cuda_stream(dstream_); + } - exec_place activate(size_t idx) const override - { - EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); - return exec_place::device(dstream_.dev_id).activate(); - } + exec_place activate(size_t idx) const override + { + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::device(dstream_.dev_id).activate(); + } - void 
deactivate(size_t idx, const exec_place& prev) const override - { - EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); - exec_place::device(dstream_.dev_id).deactivate(prev); - } + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + exec_place::device(dstream_.dev_id).deactivate(prev); + } - stream_pool& get_stream_pool(bool) const override - { - return dummy_pool_; - } + stream_pool& get_stream_pool(bool) const override + { + return dummy_pool_; + } - ::std::string to_string() const override - { - return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")"; - } + ::std::string to_string() const override + { + return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")"; + } - bool operator==(const exec_place::impl& rhs) const override + bool operator==(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return false; - } - const auto& other = static_cast(rhs); - return dstream_.stream == other.dstream_.stream; + return false; } + const auto& other = static_cast(rhs); + return dstream_.stream == other.dstream_.stream; + } - size_t hash() const override - { - return ::std::hash()(dstream_.stream); - } + size_t hash() const override + { + return ::std::hash()(dstream_.stream); + } - bool operator<(const exec_place::impl& rhs) const override + bool operator<(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return dstream_.stream < other.dstream_.stream; + return typeid(*this).before(typeid(rhs)); } - - private: - decorated_stream dstream_; - mutable stream_pool dummy_pool_; - }; - -public: - exec_place_cuda_stream(const 
decorated_stream& dstream) - : exec_place(::std::make_shared(dstream)) - { - static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place), - "exec_place_cuda_stream cannot add state; it would be sliced away."); + const auto& other = static_cast(rhs); + return dstream_.stream < other.dstream_.stream; } + +private: + decorated_stream dstream_; + mutable stream_pool dummy_pool_; }; -inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream) +inline exec_place exec_place::cuda_stream(cudaStream_t stream) { int devid = get_device_from_stream(stream); - return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid)); + return exec_place( + ::std::make_shared(decorated_stream(stream, get_stream_id(stream), devid))); } -inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream) +inline exec_place exec_place::cuda_stream(const decorated_stream& dstream) { - return exec_place_cuda_stream(dstream); + return exec_place(::std::make_shared(dstream)); } } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index f1708188986..e83c22c8aba 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -51,7 +51,6 @@ namespace cuda::experimental::stf class exec_place; class exec_place_host; class exec_place_grid; -class exec_place_cuda_stream; // Green contexts are only supported since CUDA 12.4 #if _CCCL_CTK_AT_LEAST(12, 4) @@ -854,8 +853,8 @@ public: static exec_place green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place = false); #endif // _CCCL_CTK_AT_LEAST(12, 4) - static exec_place_cuda_stream cuda_stream(cudaStream_t stream); - static exec_place_cuda_stream cuda_stream(const decorated_stream& dstream); + static exec_place cuda_stream(cudaStream_t stream); + static exec_place cuda_stream(const 
decorated_stream& dstream); /** * @brief Returns the currently active device. From eb16cfeaf922480e12a1cc7e86cf4616f12a4e82 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 22:46:50 -0400 Subject: [PATCH 03/12] Remove exec_place_green_ctx shell class Same as exec_place_cuda_stream - the shell class adds no value. Use the dynamic interface directly and return exec_place from factory. Made-with: Cursor --- .../__stf/places/exec/green_context.cuh | 173 ++++++++---------- .../cuda/experimental/__stf/places/places.cuh | 3 - 2 files changed, 80 insertions(+), 96 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index 7a7c6b62c8d..b0dc687be6d 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -252,127 +252,114 @@ private: }; /** - * @brief Designates execution that is to run on a green context. Initialize with the device ordinal and green_context + * @brief Implementation for green context execution places */ -class exec_place_green_ctx : public exec_place +class exec_place_green_ctx_impl : public exec_place::impl { public: - class impl : public exec_place::impl + /** + * @brief Construct a green context execution place + * + * @param gc_view The green context view + * @param use_green_ctx_data_place If true, use a green context data place as the + * affine data place. If false (default), use a regular device data place instead. + */ + exec_place_green_ctx_impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false) + : exec_place::impl( + use_green_ctx_data_place ? 
make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid)) + , devid_(gc_view.devid) + , g_ctx_(gc_view.g_ctx) + , pool_(mv(gc_view.pool)) + {} + + // This is used to implement deactivate and wrap an existing context + exec_place_green_ctx_impl(CUcontext saved_context) + : driver_context_(saved_context) + {} + + exec_place get_place(size_t idx) const override { - public: - /** - * @brief Construct a green context execution place - * - * @param gc_view The green context view - * @param use_green_ctx_data_place If true, use a green context data place as the - * affine data place. If false (default), use a regular device data place instead. - */ - impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false) - : exec_place::impl( - use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid)) - , devid_(gc_view.devid) - , g_ctx_(gc_view.g_ctx) - , pool_(mv(gc_view.pool)) - {} - - // This is used to implement deactivate and wrap an existing context - impl(CUcontext saved_context) - : driver_context_(saved_context) - {} - - // Grid interface - green_ctx is a 1-element grid - exec_place get_place(size_t idx) const override - { - EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); - return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_)); - } + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_)); + } - exec_place activate(size_t idx) const override - { - EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + exec_place activate(size_t idx) const override + { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); - // Save the current context and transform it into a fake green context place - CUcontext current_ctx; - cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); - exec_place result = exec_place(::std::make_shared(current_ctx)); + // Save the current context and transform it into a fake 
green context place + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); + exec_place result = exec_place(::std::make_shared(current_ctx)); - // Convert the green context to a primary context - cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_)); - cuda_safe_call(cuCtxSetCurrent(driver_context_)); + // Convert the green context to a primary context + cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_)); + cuda_safe_call(cuCtxSetCurrent(driver_context_)); - return result; - } + return result; + } - void deactivate(size_t idx, const exec_place& prev) const override - { - EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); - auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); - CUcontext saved_ctx = prev_impl->driver_context_; + auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); + CUcontext saved_ctx = prev_impl->driver_context_; # ifdef DEBUG - CUcontext current_ctx; - cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); - assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_)); + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(¤t_ctx)); + assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_)); # endif - cuda_safe_call(cuCtxSetCurrent(saved_ctx)); - } + cuda_safe_call(cuCtxSetCurrent(saved_ctx)); + } - ::std::string to_string() const override - { - return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")"; - } + ::std::string to_string() const override + { + return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")"; + } - stream_pool& get_stream_pool(bool) const override - { - return pool_; - } + stream_pool& get_stream_pool(bool) const override + { + return pool_; + } - bool operator==(const 
exec_place::impl& rhs) const override + bool operator==(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return false; - } - const auto& other = static_cast(rhs); - return g_ctx_ == other.g_ctx_; + return false; } + const auto& other = static_cast(rhs); + return g_ctx_ == other.g_ctx_; + } - size_t hash() const override - { - return ::std::hash()(g_ctx_); - } + size_t hash() const override + { + return ::std::hash()(g_ctx_); + } - bool operator<(const exec_place::impl& rhs) const override + bool operator<(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return g_ctx_ < other.g_ctx_; + return typeid(*this).before(typeid(rhs)); } - - private: - int devid_ = -1; - CUgreenCtx g_ctx_ = {}; - mutable CUcontext driver_context_ = {}; - mutable stream_pool pool_; - }; - -public: - exec_place_green_ctx(green_ctx_view gc_view, bool use_green_ctx_data_place = false) - : exec_place(::std::make_shared(mv(gc_view), use_green_ctx_data_place)) - { - static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place), - "exec_place_green_ctx cannot add state; it would be sliced away."); + const auto& other = static_cast(rhs); + return g_ctx_ < other.g_ctx_; } + +private: + int devid_ = -1; + CUgreenCtx g_ctx_ = {}; + mutable CUcontext driver_context_ = {}; + mutable stream_pool pool_; }; inline exec_place exec_place::green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place) { - return exec_place_green_ctx(gc_view, use_green_ctx_data_place); + return exec_place(::std::make_shared(gc_view, use_green_ctx_data_place)); } inline ::std::shared_ptr green_ctx_data_place_impl::get_affine_exec_impl() const diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 
e83c22c8aba..467c1b2d474 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -53,9 +53,6 @@ class exec_place_host; class exec_place_grid; // Green contexts are only supported since CUDA 12.4 -#if _CCCL_CTK_AT_LEAST(12, 4) -class exec_place_green_ctx; -#endif // _CCCL_CTK_AT_LEAST(12, 4) //! Function type for computing executor placement from data coordinates using get_executor_func_t = pos4 (*)(pos4, dim4, dim4); From a99fd0639f29f227f78bb94a8ccb973a4acfb063 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 23:37:12 -0400 Subject: [PATCH 04/12] Clean up affine_data_place() return type and remove virtual is_grid() - Remove meaningless const from return-by-value affine_data_place() - Remove virtual is_grid() from impl - just use size() > 1 directly Made-with: Cursor --- .../cuda/experimental/__stf/places/places.cuh | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 467c1b2d474..912c5737f1d 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -466,7 +466,7 @@ public: // ===== Properties ===== - virtual const data_place affine_data_place() const + virtual data_place affine_data_place() const { return affine; } @@ -486,15 +486,6 @@ public: return affine.is_device(); } - /** - * @brief Check if this is a multi-element grid (size > 1) - * @deprecated Use size() > 1 instead - */ - virtual bool is_grid() const - { - return size() > 1; - } - virtual void set_affine_data_place(data_place place) { affine = mv(place); @@ -760,7 +751,7 @@ public: return pimpl->to_string(); } - const data_place affine_data_place() const + data_place affine_data_place() const { return pimpl->affine_data_place(); } @@ -803,7 +794,7 @@ public: */ bool is_grid() 
const { - return pimpl->is_grid(); + return size() > 1; } /** @@ -1037,7 +1028,7 @@ public: _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev"); } - const data_place affine_data_place() const override + data_place affine_data_place() const override { return data_place::host(); } From f7d6c0520f18cf053f5e00b4fa75552ebdc2f8da Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 23:43:16 -0400 Subject: [PATCH 05/12] Simplify exec_place comparison using three-way cmp() Replace separate operator== and operator< virtual methods with a single cmp() method that returns -1/0/1, consistent with data_place_interface. Made-with: Cursor --- .../__stf/places/exec/cuda_stream.cuh | 24 +++--- .../__stf/places/exec/green_context.cuh | 24 +++--- .../cuda/experimental/__stf/places/places.cuh | 85 +++++++++++-------- 3 files changed, 73 insertions(+), 60 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh index c4935000adc..a490f57054f 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh @@ -69,14 +69,22 @@ public: return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")"; } - bool operator==(const exec_place::impl& rhs) const override + int cmp(const exec_place::impl& rhs) const override { if (typeid(*this) != typeid(rhs)) { - return false; + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; } const auto& other = static_cast(rhs); - return dstream_.stream == other.dstream_.stream; + if (dstream_.stream < other.dstream_.stream) + { + return -1; + } + if (other.dstream_.stream < dstream_.stream) + { + return 1; + } + return 0; } size_t hash() const override @@ -84,16 +92,6 @@ public: return ::std::hash()(dstream_.stream); } - bool operator<(const exec_place::impl& rhs) const override - { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return dstream_.stream < other.dstream_.stream; - } - private: decorated_stream dstream_; mutable stream_pool dummy_pool_; diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index b0dc687be6d..284cb3f134e 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -325,14 +325,22 @@ public: return pool_; } - bool operator==(const exec_place::impl& rhs) const override + int cmp(const exec_place::impl& rhs) const override { if (typeid(*this) != typeid(rhs)) { - return false; + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; } const auto& other = static_cast(rhs); - return g_ctx_ == other.g_ctx_; + if (g_ctx_ < other.g_ctx_) + { + return -1; + } + if (other.g_ctx_ < g_ctx_) + { + return 1; + } + return 0; } size_t hash() const override @@ -340,16 +348,6 @@ public: return ::std::hash()(g_ctx_); } - bool operator<(const exec_place::impl& rhs) const override - { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return g_ctx_ < other.g_ctx_; - } - private: int devid_ = -1; CUgreenCtx g_ctx_ = {}; diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 912c5737f1d..a07ac3ffc97 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -493,9 +493,25 @@ public: // ===== Comparison ===== - virtual bool operator==(const impl& rhs) const + /** + * @brief Three-way comparison + * @return -1 if *this < rhs, 0 if *this == rhs, 1 if *this > rhs + */ + virtual int cmp(const impl& rhs) const { - return affine == rhs.affine; + if (typeid(*this) != typeid(rhs)) + { + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; + } + if (affine < rhs.affine) + { + return -1; + } + if (rhs.affine < affine) + { + return 1; + } + return 0; } virtual size_t hash() const @@ -503,15 +519,6 @@ public: return affine.hash(); } - virtual bool operator<(const impl& rhs) const - { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - return device_ordinal(affine) < device_ordinal(rhs.affine); - } - // ===== Stream management ===== virtual stream_pool& get_stream_pool(bool for_computation) const @@ -543,8 +550,13 @@ public: bool operator==(const exec_place& rhs) const { - return *pimpl == *rhs.pimpl; + if (pimpl.get() == rhs.pimpl.get()) + { + return true; + } + return pimpl->cmp(*rhs.pimpl) == 0; } + bool operator!=(const exec_place& rhs) const { return !(*this == rhs); @@ -552,7 +564,7 @@ public: bool operator<(const exec_place& rhs) const { - return *pimpl < *rhs.pimpl; + return pimpl->cmp(*rhs.pimpl) < 0; } bool operator>(const exec_place& rhs) const @@ -1249,14 +1261,34 @@ public: // ===== Comparison ===== - bool operator==(const exec_place::impl& rhs) const override + int cmp(const exec_place::impl& rhs) const override { - auto other = dynamic_cast(&rhs); - if (!other) + if (typeid(*this) != typeid(rhs)) + { + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; + } + const auto& other = static_cast(rhs); + // Compare dims first + auto this_dims = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t); + auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); + if (this_dims < other_dims) + { + return -1; + } + if (other_dims < this_dims) + { + return 1; + } + // Then compare places + if (places_ < other.places_) + { + return -1; + } + if (other.places_ < places_) { - return false; + return 1; } - return dims_ == other->dims_ && places_ == other->places_; + return 0; } size_t hash() const override @@ -1269,21 +1301,6 @@ public: return h; } - bool operator<(const exec_place::impl& rhs) const override - { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - if (!(dims_ == other.dims_)) - { - return ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t) - < ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); - } - return places_ < other.places_; - } - // ===== Stream management ===== stream_pool& get_stream_pool(bool for_computation) const override @@ -1312,7 +1329,7 @@ public: bool operator==(const exec_place_grid& rhs) const { - return *get_impl() == *(rhs.get_impl()); + return get_impl()->cmp(*rhs.get_impl()) == 0; } /** From 86aab4d26375c7fce064bb835b2d8746f2988283 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 13 Mar 2026 23:47:44 -0400 Subject: [PATCH 06/12] Fix place_partition for 1-element grids With the unified grid model, 1-element grids have size()==1 but is_device()==false. Update place_partition to handle this case by extracting the underlying scalar place from 1-element grids. 
Made-with: Cursor --- .../__stf/places/place_partition.cuh | 78 +++++++++++++++---- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh index e0d2afed705..002b4134830 100644 --- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -222,9 +222,9 @@ private: /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */ void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope) { + // Handle multi-element grids by recursively partitioning if (place.size() > 1 && scope == place_partition_scope::cuda_stream) { - // Recursively partition grid into devices, then into streams for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device)) { auto device_p_places = place_partition(device_p, handle, place_partition_scope::cuda_stream).sub_places; @@ -233,6 +233,27 @@ private: return; } + // Handle scalar places (including 1-element grids) for cuda_stream scope + if (place.size() == 1 && scope == place_partition_scope::cuda_stream) + { + // Get the underlying scalar place (for 1-element grids, get the single element) + exec_place scalar_place = place.is_device() ? 
place : place.get_place(0); + if (!scalar_place.is_device()) + { + // Host or other non-device place - no streams to partition into + sub_places.push_back(place); + return; + } + auto& pool = scalar_place.get_stream_pool(true); + for (size_t i = 0; i < pool.size(); i++) + { + decorated_stream dstream = pool.next(scalar_place); + sub_places.push_back(exec_place::cuda_stream(dstream)); + } + return; + } + + // Legacy path for explicit device check (kept for compatibility) if (place.is_device() && scope == place_partition_scope::cuda_stream) { auto& pool = place.get_stream_pool(true); @@ -258,18 +279,40 @@ private: return; } + // Handle scalar places (including 1-element grids) for green_context scope + if (place.size() == 1 && scope == place_partition_scope::green_context) + { + exec_place scalar_place = place.is_device() ? place : place.get_place(0); + if (!scalar_place.is_device()) + { + sub_places.push_back(place); + return; + } + int dev_id = device_ordinal(scalar_place.affine_data_place()); + + const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE"); + int sm_cnt = env ? atoi(env) : 8; + + auto h = handle.get_gc_helper(dev_id, sm_cnt); + + size_t cnt = h->get_count(); + for (size_t i = 0; i < cnt; i++) + { + sub_places.push_back(exec_place::green_ctx(h->get_view(i))); + } + return; + } + + // Legacy path for explicit device check (kept for compatibility) if (place.is_device() && scope == place_partition_scope::green_context) { - // Find the device associated to the place, and get the green context helper int dev_id = device_ordinal(place.affine_data_place()); - // 8 SMs per green context is a granularity that should work on any arch. const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE"); int sm_cnt = env ? 
atoi(env) : 8; auto h = handle.get_gc_helper(dev_id, sm_cnt); - // Get views of green context out of the helper to create execution places size_t cnt = h->get_count(); for (size_t i = 0; i < cnt; i++) { @@ -291,19 +334,26 @@ private: #endif // _CCCL_CTK_BELOW(12, 4) _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle."); - if (place.size() > 1 && scope == place_partition_scope::cuda_device) + if (scope == place_partition_scope::cuda_device) { - // Get places from the grid - for (size_t i = 0; i < place.size(); ++i) + if (place.size() > 1) { - sub_places.push_back(place.get_place(i)); + // Multi-element grid: extract all places + for (size_t i = 0; i < place.size(); ++i) + { + sub_places.push_back(place.get_place(i)); + } + } + else if (place.is_device()) + { + // Scalar device place + sub_places.push_back(place); + } + else + { + // 1-element grid or other scalar place: extract the underlying place + sub_places.push_back(place.get_place(0)); } - return; - } - - if (place.is_device() && scope == place_partition_scope::cuda_device) - { - sub_places.push_back(place); return; } From 0ef69031d488f2579d8342b49a9b655e9e76b3f5 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Sat, 14 Mar 2026 00:18:15 -0400 Subject: [PATCH 07/12] Simplify exec_place::impl by removing unnecessary virtuals - Remove virtual is_host()/is_device() from impl; move logic to shell (base impl already returns correct values via affine data place) - Move grid iteration state (current_idx, saved_prev_impl) to grid impl since only multi-element grids use iteration - Add virtual accessors for grid state with assertions for misuse Made-with: Cursor --- .../cuda/experimental/__stf/places/places.cuh | 102 +++++++++++------- 1 file changed, 61 insertions(+), 41 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index a07ac3ffc97..767c810e6b9 100644 --- 
a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -443,8 +443,7 @@ public: { cuda_safe_call(cudaSetDevice(new_dev_id)); } - auto old_dev = data_place::device(old_dev_id); - return exec_place(mv(old_dev)); + return exec_place(data_place::device(old_dev_id)); } /** @@ -476,16 +475,6 @@ public: return "exec(" + affine.to_string() + ")"; } - virtual bool is_host() const - { - return affine.is_host(); - } - - virtual bool is_device() const - { - return affine.is_device(); - } - virtual void set_affine_data_place(data_place place) { affine = mv(place); @@ -529,15 +518,29 @@ public: static constexpr size_t pool_size = 4; static constexpr size_t data_pool_size = 4; + // Grid iteration state - only meaningful for multi-element grids + virtual ::std::ptrdiff_t get_current_idx() const + { + return -1; + } + virtual void set_current_idx(::std::ptrdiff_t) const + { + _CCCL_ASSERT(false, "set_current_idx called on non-grid exec_place"); + } + virtual ::std::shared_ptr get_saved_prev_impl() const + { + return nullptr; + } + virtual void set_saved_prev_impl(::std::shared_ptr) const + { + _CCCL_ASSERT(false, "set_saved_prev_impl called on non-grid exec_place"); + } + protected: friend class exec_place; data_place affine = data_place::invalid(); mutable stream_pool pool_compute; mutable stream_pool pool_data; - - // Current place state for grid iteration - mutable ::std::ptrdiff_t current_idx = -1; - mutable ::std::shared_ptr saved_prev_impl; }; exec_place() = default; @@ -710,14 +713,15 @@ public: */ void set_current_place(size_t idx) { - if (pimpl->current_idx >= 0) + auto cur_idx = pimpl->get_current_idx(); + if (cur_idx >= 0) { - exec_place saved_prev(pimpl->saved_prev_impl); - pimpl->deactivate(pimpl->current_idx, saved_prev); + exec_place saved_prev(pimpl->get_saved_prev_impl()); + pimpl->deactivate(cur_idx, saved_prev); } - pimpl->current_idx = static_cast<::std::ptrdiff_t>(idx); - exec_place prev = 
pimpl->activate(idx); - pimpl->saved_prev_impl = prev.pimpl; + pimpl->set_current_idx(static_cast<::std::ptrdiff_t>(idx)); + exec_place prev = pimpl->activate(idx); + pimpl->set_saved_prev_impl(prev.pimpl); } /** @@ -733,10 +737,11 @@ public: */ void unset_current_place() { - EXPECT(pimpl->current_idx >= 0, "unset_current_place() called without corresponding set_current_place()"); - exec_place saved_prev(pimpl->saved_prev_impl); - pimpl->deactivate(pimpl->current_idx, saved_prev); - pimpl->current_idx = -1; + auto cur_idx = pimpl->get_current_idx(); + EXPECT(cur_idx >= 0, "unset_current_place() called without corresponding set_current_place()"); + exec_place saved_prev(pimpl->get_saved_prev_impl()); + pimpl->deactivate(cur_idx, saved_prev); + pimpl->set_current_idx(-1); } /** @@ -744,8 +749,9 @@ public: */ exec_place get_current_place() const { - EXPECT(pimpl->current_idx >= 0, "No current place set"); - return get_place(pimpl->current_idx); + auto cur_idx = pimpl->get_current_idx(); + EXPECT(cur_idx >= 0, "No current place set"); + return get_place(cur_idx); } /** @@ -753,7 +759,7 @@ public: */ ::std::ptrdiff_t current_place_id() const { - return pimpl->current_idx; + return pimpl->get_current_idx(); } // ===== Properties ===== @@ -792,12 +798,12 @@ public: bool is_host() const { - return pimpl->is_host(); + return affine_data_place().is_host(); } bool is_device() const { - return pimpl->is_device(); + return affine_data_place().is_device(); } /** @@ -1249,16 +1255,6 @@ public: + "x" + ::std::to_string(dims_.t) + ")"; } - bool is_device() const override - { - return false; - } - - bool is_host() const override - { - return false; - } - // ===== Comparison ===== int cmp(const exec_place::impl& rhs) const override @@ -1317,9 +1313,33 @@ public: return places_; } + // ===== Grid iteration state ===== + + ::std::ptrdiff_t get_current_idx() const override + { + return current_idx_; + } + + void set_current_idx(::std::ptrdiff_t idx) const override + { + current_idx_ = 
idx; + } + + ::std::shared_ptr get_saved_prev_impl() const override + { + return saved_prev_impl_; + } + + void set_saved_prev_impl(::std::shared_ptr p) const override + { + saved_prev_impl_ = mv(p); + } + private: dim4 dims_; ::std::vector places_; + mutable ::std::ptrdiff_t current_idx_ = -1; + mutable ::std::shared_ptr saved_prev_impl_; }; explicit operator bool() const From 672cf33d0a7f03331fc327c91992b10808e4533e Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Mon, 16 Mar 2026 13:42:21 -0400 Subject: [PATCH 08/12] Fix is_grid() and collapse 1-element grids in factory functions The condition `is_grid()` was changed to `size() > 1` but this broke 1-element grids (e.g. `all_devices()` on single-GPU systems). Fix 1: Restore is_grid() to detect any grid by checking if affine_data_place() is invalid (only grids have invalid affine). Fix 2: Factory functions now collapse 1-element grids to scalars: - make_grid(), all_devices(), n_devices(), repeat() - partition_cyclic(), partition_tile() This ensures that by construction, any true grid has size() > 1, making `size() > 1` equivalent to `is_grid()` in practice. 
Benefits: - Simpler mental model: grids always have multiple elements - No edge cases for 1-element grids - Single-GPU `all_devices()` returns `device(0)` directly Return type changes (exec_place_grid -> exec_place): - all_devices(), n_devices(), repeat(), make_grid() - partition_cyclic(), partition_tile() - place_partition::as_grid() Made-with: Cursor --- .../experimental/__stf/internal/launch.cuh | 4 +- .../__stf/internal/parallel_for_scope.cuh | 11 ++-- .../__stf/places/place_partition.cuh | 6 +-- .../cuda/experimental/__stf/places/places.cuh | 50 +++++++++++++------ cudax/test/stf/places/recursion.cu | 2 +- 5 files changed, 46 insertions(+), 27 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index f20205acd59..3888efef429 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -331,11 +331,11 @@ public: assert(e_place.affine_data_place() == t.get_affine_data_place()); /* - * If we have a multi-place grid, the implicit affine partitioner is the blocked_partition. + * If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition. * * An explicit composite data place is required per data dependency to customize this behaviour. 
*/ - if (e_place.size() > 1) + if (e_place.is_grid()) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid())); diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 90acd49ad22..50d4f523946 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -550,8 +550,8 @@ public: // If there is a partitioner, we ensure there is a proper affine data place for this execution place if constexpr (!::std::is_same_v) { - // This is only meaningful for multi-place grids - if (e_place.size() > 1) + // Grids (including 1-element grids) need a composite data place + if (e_place.is_grid()) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid())); @@ -662,14 +662,15 @@ public: if (e_place.size() == 1) { // Apply the parallel_for construct over the entire shape on the - // execution place of the task + // execution place of the task. For 1-element grids, extract the element. + const exec_place& scalar_place = e_place.is_grid() ? 
e_place.get_place(0) : e_place; if constexpr (need_reduction) { - do_parallel_for_redux(f, e_place, shape, t); + do_parallel_for_redux(f, scalar_place, shape, t); } else { - do_parallel_for(f, e_place, shape, t); + do_parallel_for(f, scalar_place, shape, t); } } else diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh index 002b4134830..35a0e7e5c32 100644 --- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -210,10 +210,10 @@ public: return sub_places[i]; } - /** @brief Build an exec_place_grid from the subplaces. - * @return A grid view of the partitioned execution places. + /** @brief Build an exec_place from the subplaces. + * @return A grid view of the partitioned execution places, or single place if size == 1. */ - exec_place_grid as_grid() const + exec_place as_grid() const { return make_grid(sub_places); } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 767c810e6b9..63dd0c958d8 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -807,12 +807,13 @@ public: } /** - * @brief Check if this is a multi-element grid (size > 1) - * @deprecated Use size() > 1 instead. All places are now grids. + * @brief Check if this is a grid execution place + * + * Returns true for any grid, including 1-element grids. 
*/ bool is_grid() const { - return size() > 1; + return affine_data_place().is_invalid(); } /** @@ -872,14 +873,14 @@ public: return exec_place::device(cuda_try()); } - static exec_place_grid all_devices(); + static exec_place all_devices(); - static exec_place_grid n_devices(size_t n, dim4 dims); + static exec_place n_devices(size_t n, dim4 dims); - static exec_place_grid n_devices(size_t n); + static exec_place n_devices(size_t n); // For debug purpose on a machine with a single GPU, for example - static exec_place_grid repeat(const exec_place& e, size_t cnt); + static exec_place repeat(const exec_place& e, size_t cnt); template auto partition_by_scope(Args&&... args); @@ -1384,13 +1385,20 @@ public: }; //! Creates a grid of execution places with specified dimensions -inline exec_place_grid make_grid(::std::vector places, const dim4& dims) +//! Returns the single element if size == 1 (no grid wrapper needed) +inline exec_place make_grid(::std::vector places, const dim4& dims) { + _CCCL_ASSERT(!places.empty(), "invalid places"); + if (places.size() == 1) + { + return mv(places[0]); + } return exec_place_grid(mv(places), dims); } //! Creates a linear grid from a vector of execution places -inline exec_place_grid make_grid(::std::vector places) +//! Returns the single element if size == 1 (no grid wrapper needed) +inline exec_place make_grid(::std::vector places) { _CCCL_ASSERT(!places.empty(), "invalid places"); auto grid_dim = dim4(places.size(), 1, 1, 1); @@ -1462,20 +1470,26 @@ inline exec_place exec_place_device::impl::get_place(size_t idx) const } //! Creates a grid by replicating an execution place multiple times -inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt) +//! 
Returns the original place if cnt == 1 (no grid wrapper needed) +inline exec_place exec_place::repeat(const exec_place& e, size_t cnt) { + if (cnt == 1) + { + return e; + } return make_grid(::std::vector(cnt, e)); } /* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ inline exec_place_grid exec_place::as_grid() const { - EXPECT(size() > 1, "as_grid() called on scalar exec_place"); + EXPECT(is_grid(), "as_grid() called on scalar exec_place"); return exec_place_grid(::std::static_pointer_cast(pimpl)); } /* Get the first N available devices */ -inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims) +//! Returns single device if n == 1 (no grid wrapper needed) +inline exec_place exec_place::n_devices(size_t n, dim4 dims) { const int ndevs = cuda_try(); @@ -1492,18 +1506,21 @@ inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims) } /* Get the first N available devices */ -inline exec_place_grid exec_place::n_devices(size_t n) +//! Returns single device if n == 1 (no grid wrapper needed) +inline exec_place exec_place::n_devices(size_t n) { return n_devices(n, dim4(n, 1, 1, 1)); } -inline exec_place_grid exec_place::all_devices() +//! Returns all available devices, or single device if only one GPU +inline exec_place exec_place::all_devices() { return n_devices(cuda_try()); } //! Creates a cyclic partition of an execution place grid with specified strides -inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id) +//! Returns single place if partition contains only one element +inline exec_place partition_cyclic(const exec_place& e_place, dim4 strides, pos4 tile_id) { dim4 g_dims = e_place.get_dims(); @@ -1547,10 +1564,11 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str } //! Creates a tiled partition of an execution place grid with specified tile sizes +//! 
Returns single place if partition contains only one element //! //! example : //! auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1)) -inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id) +inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos4 tile_id) { dim4 g_dims = e_place.get_dims(); diff --git a/cudax/test/stf/places/recursion.cu b/cudax/test/stf/places/recursion.cu index 3af51e4dc98..de86d7c22bf 100644 --- a/cudax/test/stf/places/recursion.cu +++ b/cudax/test/stf/places/recursion.cu @@ -12,7 +12,7 @@ using namespace cuda::experimental::stf; -void rec_func(exec_place_grid places) +void rec_func(exec_place places) { if (places.size() == 1) { From 36ea11ee06f63ebc6fbff9f777c3e569287d88a2 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Mon, 16 Mar 2026 15:46:29 -0400 Subject: [PATCH 09/12] Remove is_grid() method - use size() > 1 instead With 1-element grids now collapsed to scalars by factory functions, is_grid() is equivalent to size() > 1. Remove the method and use the simpler size check directly. Made-with: Cursor --- .../cuda/experimental/__stf/internal/launch.cuh | 2 +- .../__stf/internal/parallel_for_scope.cuh | 8 ++++---- .../cuda/experimental/__stf/places/places.cuh | 12 +----------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 3888efef429..235219943db 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -335,7 +335,7 @@ public: * * An explicit composite data place is required per data dependency to customize this behaviour. 
*/ - if (e_place.is_grid()) + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid())); diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 50d4f523946..53da4f55f9f 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -550,8 +550,8 @@ public: // If there is a partitioner, we ensure there is a proper affine data place for this execution place if constexpr (!::std::is_same_v) { - // Grids (including 1-element grids) need a composite data place - if (e_place.is_grid()) + // Grids need a composite data place + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid())); @@ -662,8 +662,8 @@ public: if (e_place.size() == 1) { // Apply the parallel_for construct over the entire shape on the - // execution place of the task. For 1-element grids, extract the element. - const exec_place& scalar_place = e_place.is_grid() ? e_place.get_place(0) : e_place; + // execution place of the task. + const exec_place& scalar_place = e_place; if constexpr (need_reduction) { do_parallel_for_redux(f, scalar_place, shape, t); diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 63dd0c958d8..e49d690cf06 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -806,16 +806,6 @@ public: return affine_data_place().is_device(); } - /** - * @brief Check if this is a grid execution place - * - * Returns true for any grid, including 1-element grids. 
- */ - bool is_grid() const - { - return affine_data_place().is_invalid(); - } - /** * @brief Get the dimension along a specific axis * @deprecated Use get_dims().get(axis_id) instead @@ -1483,7 +1473,7 @@ inline exec_place exec_place::repeat(const exec_place& e, size_t cnt) /* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ inline exec_place_grid exec_place::as_grid() const { - EXPECT(is_grid(), "as_grid() called on scalar exec_place"); + EXPECT(size() > 1, "as_grid() called on scalar exec_place"); return exec_place_grid(::std::static_pointer_cast(pimpl)); } From 383957adcd2f62558c789e0b0784f02082660e66 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Mon, 16 Mar 2026 18:34:22 -0400 Subject: [PATCH 10/12] Change data_place::composite and get_grid to use exec_place Update data_place::composite() to accept const exec_place& instead of const exec_place_grid&. This allows passing exec_place from factory functions that now return exec_place (like repeat(), all_devices()). 
Also update: - data_place_composite to store exec_place instead of exec_place_grid - get_grid() to return const exec_place& - localized_array constructor parameter - slice interface local variables Made-with: Cursor --- .../__stf/graph/interfaces/slice.cuh | 2 +- .../__stf/localization/composite_slice.cuh | 4 ++-- .../__stf/places/data_place_interface.cuh | 2 +- .../cuda/experimental/__stf/places/places.cuh | 16 ++++++++-------- .../__stf/stream/interfaces/slice.cuh | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh index 6aa1b142e9f..4a053ea53b3 100644 --- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh @@ -87,7 +87,7 @@ public: return; } - exec_place_grid grid = memory_node.get_grid(); + exec_place grid = memory_node.get_grid(); size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index c4298f35459..b9d7771573a 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -75,7 +75,7 @@ public: // ::std::function delinearize : translate the index in a buffer into a position in the data // TODO pass mv(place) template - localized_array(exec_place_grid grid, + localized_array(exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, @@ -422,7 +422,7 @@ private: } event_list prereqs; // To allow reuse in a cache - exec_place_grid grid; + exec_place grid; get_executor_func_t mapper = nullptr; ::std::vector meta; diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh 
b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh index ecf6d8f7b45..b5ff565c251 100644 --- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh @@ -184,7 +184,7 @@ public: * @brief Get the grid for composite places * @throws std::logic_error if not a composite place */ - virtual const exec_place_grid& get_grid() const + virtual const exec_place& get_grid() const { throw ::std::logic_error("get_grid() called on non-composite data_place"); } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index e49d690cf06..58bc80ec9f8 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -165,9 +165,9 @@ public: // User-visible API when using a different partitioner than the one of the grid template - static data_place composite(partitioner_t p, const exec_place_grid& g); + static data_place composite(partitioner_t p, const exec_place& g); - static data_place composite(get_executor_func_t f, const exec_place_grid& grid); + static data_place composite(get_executor_func_t f, const exec_place& grid); #if _CCCL_CTK_AT_LEAST(12, 4) static data_place green_ctx(const green_ctx_view& gc_view); @@ -296,7 +296,7 @@ public: return p.pimpl_->get_device_ordinal(); } - const exec_place_grid& get_grid() const + const exec_place& get_grid() const { return pimpl_->get_grid(); } @@ -1619,7 +1619,7 @@ inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos class data_place_composite final : public data_place_interface { public: - data_place_composite(exec_place_grid grid, get_executor_func_t partitioner_func) + data_place_composite(exec_place grid, get_executor_func_t partitioner_func) : grid_(mv(grid)) , partitioner_func_(mv(partitioner_func)) {} @@ -1679,7 +1679,7 @@ public: return false; } - const 
exec_place_grid& get_grid() const override + const exec_place& get_grid() const override { return grid_; } @@ -1690,7 +1690,7 @@ public: } private: - exec_place_grid grid_; + exec_place grid_; get_executor_func_t partitioner_func_; }; @@ -1700,14 +1700,14 @@ inline bool data_place::is_composite() const return typeid(ref) == typeid(data_place_composite); } -inline data_place data_place::composite(get_executor_func_t f, const exec_place_grid& grid) +inline data_place data_place::composite(get_executor_func_t f, const exec_place& grid) { return data_place(::std::make_shared(grid, f)); } // User-visible API when the same partitioner as the one of the grid template -data_place data_place::composite(partitioner_t, const exec_place_grid& g) +data_place data_place::composite(partitioner_t, const exec_place& g) { return data_place::composite(&partitioner_t::get_executor, g); } diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index 21ac9da53a6..5a01d242608 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -94,7 +94,7 @@ public: return; } - exec_place_grid grid = memory_node.get_grid(); + exec_place grid = memory_node.get_grid(); size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) From 65346793c5886d139240798bfd8734c0b06e58be Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Mon, 16 Mar 2026 18:52:32 -0400 Subject: [PATCH 11/12] Remove exec_place_grid class, use exec_place everywhere exec_place_grid was a shell class that provided no additional value over exec_place now that all places support the unified grid model. 
This change: - Removes exec_place_grid class, keeps impl as exec_place_grid_impl - Updates make_grid() to return exec_place directly - Changes as_grid() to return const exec_place& (just returns *this) - Updates place_partition constructor to take const exec_place& - Updates loop_dispatch template parameter to exec_place - Removes deleted parallel_for(exec_place_grid, ...) overload - Updates C API (stf.cu) to use exec_place* instead of exec_place_grid* - Removes forward declarations of exec_place_grid The C API types (stf_exec_place_grid_handle) are unchanged as they are opaque void* handles that don't depend on the C++ class name. Made-with: Cursor --- c/experimental/stf/src/stf.cu | 12 +- .../__stf/internal/backend_ctx.cuh | 3 - .../__stf/internal/loop_dispatch.cuh | 2 +- .../__stf/places/data_place_interface.cuh | 1 - .../__stf/places/place_partition.cuh | 11 +- .../cuda/experimental/__stf/places/places.cuh | 263 +++++++----------- 6 files changed, 115 insertions(+), 177 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index f57ad22db37..27a07cfab0a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p) { return data_place::invalid(); } - exec_place_grid* grid_ptr = static_cast(grid_handle); + exec_place* grid_ptr = static_cast(grid_handle); // Layout-compatible: pass C mapper directly so the runtime calls it get_executor_func_t cpp_mapper = reinterpret_cast(mapper); return data_place::composite(cpp_mapper, *grid_ptr); @@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id { places.push_back(exec_place::device(device_ids[i])); } - exec_place_grid grid = make_grid(::std::move(places)); - return new exec_place_grid(::std::move(grid)); + exec_place grid = make_grid(::std::move(places)); + return new exec_place(::std::move(grid)); } stf_exec_place_grid_handle @@ -439,18 +439,18 @@ 
stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf { cpp_places.push_back(to_exec_place(const_cast(&places[i]))); } - exec_place_grid grid = + exec_place grid = (grid_dims != nullptr) ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t)) : make_grid(::std::move(cpp_places)); - return new exec_place_grid(::std::move(grid)); + return new exec_place(::std::move(grid)); } void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid) { if (grid != nullptr) { - delete static_cast(grid); + delete static_cast(grid); } } diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index b3659b7c829..3ef18a36803 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -1117,9 +1117,6 @@ public: } } - template - auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete; - template auto parallel_for(S shape, Deps... 
deps) { diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh index 3359a8ea03c..f9acdc65fcc 100644 --- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh @@ -161,7 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi } else { - loop_dispatch( + loop_dispatch( mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); } } diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh index b5ff565c251..14b20f4a707 100644 --- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh @@ -46,7 +46,6 @@ namespace cuda::experimental::stf { // Forward declarations class exec_place; -class exec_place_grid; class pos4; class dim4; diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh index 35a0e7e5c32..df67b92396b 100644 --- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -80,7 +80,7 @@ inline ::std::string place_partition_scope_to_string(place_partition_scope scope * `cuda_device` scope. Green context scope requires CUDA 12.4 or later. * * Iteration over subplaces is provided via `begin()` / `end()`; `as_grid()` builds - * an `exec_place_grid` from the subplaces. + * an `exec_place` grid from the subplaces. 
*/ class place_partition { @@ -131,14 +131,13 @@ public: * @param grid Input execution place grid to partition * @param scope Partitioning granularity */ - place_partition(async_resources_handle& handle, const exec_place_grid& grid, place_partition_scope scope) + place_partition(async_resources_handle& handle, const exec_place& grid, place_partition_scope scope) { ::std::vector<::std::shared_ptr> places; - const auto& grid_places = grid.get_places(); - places.reserve(grid_places.size()); - for (const auto& ep : grid_places) + places.reserve(grid.size()); + for (size_t i = 0; i < grid.size(); ++i) { - places.push_back(::std::make_shared(ep)); + places.push_back(::std::make_shared(grid.get_place(i))); } for (const auto& place : places) { diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 58bc80ec9f8..f5a45ba892f 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -50,7 +50,6 @@ namespace cuda::experimental::stf { class exec_place; class exec_place_host; -class exec_place_grid; // Green contexts are only supported since CUDA 12.4 @@ -825,10 +824,14 @@ public: } /** - * @brief Convert to exec_place_grid type + * @brief Returns *this for compatibility * @deprecated All places are grids now; use exec_place methods directly */ - exec_place_grid as_grid() const; + const exec_place& as_grid() const + { + EXPECT(size() > 1, "as_grid() called on scalar exec_place"); + return *this; + } /* These helper methods provide convenient way to express execution places, * for example exec_place::host or exec_place::device(4). @@ -1181,197 +1184,146 @@ UNITTEST("exec_place copyable") }; #endif // UNITTESTED_FILE -//! A multidimensional grid of execution places for structured parallel computation -class exec_place_grid : public exec_place +/** + * Implementation class for multi-device execution place grids. 
+ * This is used internally by make_grid() and related factory functions. + */ +class exec_place_grid_impl : public exec_place::impl { public: - /* - * Implementation of the exec_place_grid - */ - class impl : public exec_place::impl + exec_place_grid_impl(::std::vector _places) + : dims_(_places.size(), 1, 1, 1) + , places_(mv(_places)) { - public: - impl(::std::vector _places) - : dims_(_places.size(), 1, 1, 1) - , places_(mv(_places)) - { - _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); - _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); - } + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); + } - impl(::std::vector _places, const dim4& _dims) - : dims_(_dims) - , places_(mv(_places)) - { - _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); - } + exec_place_grid_impl(::std::vector _places, const dim4& _dims) + : dims_(_dims) + , places_(mv(_places)) + { + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); + } - // ===== Grid interface ===== + // ===== Grid interface ===== - dim4 get_dims() const override - { - return dims_; - } + dim4 get_dims() const override + { + return dims_; + } - size_t size() const override - { - return dims_.size(); - } + size_t size() const override + { + return dims_.size(); + } - exec_place get_place(size_t idx) const override - { - EXPECT(idx < places_.size(), "Index out of bounds"); - return places_[idx]; - } + exec_place get_place(size_t idx) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + return places_[idx]; + } - // ===== Activation (delegates to sub-places) ===== + // ===== Activation (delegates to sub-places) ===== - exec_place activate(size_t idx) const override - { - EXPECT(idx < places_.size(), "Index out of bounds"); - return places_[idx].activate(0); - } + exec_place activate(size_t idx) const override + { + EXPECT(idx < places_.size(), "Index out of 
bounds"); + return places_[idx].activate(0); + } - void deactivate(size_t idx, const exec_place& prev) const override - { - EXPECT(idx < places_.size(), "Index out of bounds"); - places_[idx].deactivate(0, prev); - } + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + places_[idx].deactivate(0, prev); + } - // ===== Properties ===== + // ===== Properties ===== - ::std::string to_string() const override - { - return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) - + "x" + ::std::to_string(dims_.t) + ")"; - } + ::std::string to_string() const override + { + return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + + "x" + ::std::to_string(dims_.t) + ")"; + } - // ===== Comparison ===== + // ===== Comparison ===== - int cmp(const exec_place::impl& rhs) const override + int cmp(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)) ? -1 : 1; - } - const auto& other = static_cast(rhs); - // Compare dims first - auto this_dims = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t); - auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); - if (this_dims < other_dims) - { - return -1; - } - if (other_dims < this_dims) - { - return 1; - } - // Then compare places - if (places_ < other.places_) - { - return -1; - } - if (other.places_ < places_) - { - return 1; - } - return 0; + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; } - - size_t hash() const override + const auto& other = static_cast(rhs); + // Compare dims first + auto this_dims = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t); + auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); + if (this_dims < other_dims) { - size_t h = ::cuda::experimental::stf::hash{}(dims_); - for (const auto& p : places_) - { - hash_combine(h, p.hash()); - } - return h; + return -1; } - - // ===== Stream management ===== - - stream_pool& get_stream_pool(bool for_computation) const override + if (other_dims < this_dims) { - _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool"); - _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); - return places_[0].get_stream_pool(for_computation); + return 1; } - - // ===== Grid-specific accessors ===== - - const ::std::vector& get_places() const + // Then compare places + if (places_ < other.places_) { - return places_; + return -1; } - - // ===== Grid iteration state ===== - - ::std::ptrdiff_t get_current_idx() const override + if (other.places_ < places_) { - return current_idx_; + return 1; } + return 0; + } - void set_current_idx(::std::ptrdiff_t idx) const override + size_t hash() const override + { + size_t h = ::cuda::experimental::stf::hash{}(dims_); + for (const auto& p : places_) { - current_idx_ = idx; + hash_combine(h, p.hash()); } + return h; + } - ::std::shared_ptr get_saved_prev_impl() const override - { - return saved_prev_impl_; - } + // ===== Stream management ===== - void set_saved_prev_impl(::std::shared_ptr p) const override - { - saved_prev_impl_ = mv(p); - } + stream_pool& get_stream_pool(bool for_computation) const override + { + _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool"); + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + return places_[0].get_stream_pool(for_computation); + } - private: - dim4 dims_; - ::std::vector places_; - mutable ::std::ptrdiff_t current_idx_ 
= -1; - mutable ::std::shared_ptr saved_prev_impl_; - }; + // ===== Grid iteration state ===== - explicit operator bool() const + ::std::ptrdiff_t get_current_idx() const override { - return exec_place::get_impl() != nullptr; + return current_idx_; } - bool operator==(const exec_place_grid& rhs) const + void set_current_idx(::std::ptrdiff_t idx) const override { - return get_impl()->cmp(*rhs.get_impl()) == 0; + current_idx_ = idx; } - /** - * @brief Get the vector of sub-places (grid-specific) - */ - const ::std::vector& get_places() const + ::std::shared_ptr get_saved_prev_impl() const override { - return get_impl()->get_places(); + return saved_prev_impl_; } - /** - * @brief Get the typed impl (for grid-specific operations) - */ - ::std::shared_ptr get_impl() const + void set_saved_prev_impl(::std::shared_ptr p) const override { - _CCCL_ASSERT(::std::dynamic_pointer_cast(exec_place::get_impl()), "Invalid exec_place_grid impl"); - return ::std::static_pointer_cast(exec_place::get_impl()); + saved_prev_impl_ = mv(p); } - // Default constructor - exec_place_grid() - : exec_place(nullptr) - {} - - exec_place_grid(::std::shared_ptr p) - : exec_place(mv(p)) - {} - - exec_place_grid(::std::vector p, const dim4& d) - : exec_place(::std::make_shared(mv(p), d)) - {} +private: + dim4 dims_; + ::std::vector places_; + mutable ::std::ptrdiff_t current_idx_ = -1; + mutable ::std::shared_ptr saved_prev_impl_; }; //! Creates a grid of execution places with specified dimensions @@ -1383,7 +1335,7 @@ inline exec_place make_grid(::std::vector places, const dim4& dims) { return mv(places[0]); } - return exec_place_grid(mv(places), dims); + return exec_place(::std::make_shared(mv(places), dims)); } //! 
Creates a linear grid from a vector of execution places @@ -1396,7 +1348,6 @@ inline exec_place make_grid(::std::vector places) } // === data_place::affine_exec_place implementation === -// Defined here after exec_place_grid is complete inline exec_place data_place::affine_exec_place() const { @@ -1414,7 +1365,6 @@ inline exec_place data_place::affine_exec_place() const if (is_composite()) { // Return the grid of places associated to this composite data place - // exec_place_grid inherits from exec_place, so this works via slicing return get_grid(); } @@ -1470,13 +1420,6 @@ inline exec_place exec_place::repeat(const exec_place& e, size_t cnt) return make_grid(::std::vector(cnt, e)); } -/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ -inline exec_place_grid exec_place::as_grid() const -{ - EXPECT(size() > 1, "as_grid() called on scalar exec_place"); - return exec_place_grid(::std::static_pointer_cast(pimpl)); -} - /* Get the first N available devices */ //! 
Returns single device if n == 1 (no grid wrapper needed) inline exec_place exec_place::n_devices(size_t n, dim4 dims) From 77589aee862d819fe6ec4e80469f04328d9ed785 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Mon, 16 Mar 2026 21:29:03 -0400 Subject: [PATCH 12/12] pre-commit --- c/experimental/stf/src/stf.cu | 7 +++---- .../cuda/experimental/__stf/graph/interfaces/slice.cuh | 4 ++-- .../cuda/experimental/__stf/internal/loop_dispatch.cuh | 3 +-- .../experimental/__stf/localization/composite_slice.cuh | 8 ++------ cudax/include/cuda/experimental/__stf/places/places.cuh | 4 ++-- .../cuda/experimental/__stf/stream/interfaces/slice.cuh | 4 ++-- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 27a07cfab0a..7441e508916 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -439,10 +439,9 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf { cpp_places.push_back(to_exec_place(const_cast(&places[i]))); } - exec_place grid = - (grid_dims != nullptr) - ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t)) - : make_grid(::std::move(cpp_places)); + exec_place grid = (grid_dims != nullptr) + ? 
make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t)) + : make_grid(::std::move(cpp_places)); return new exec_place(::std::move(grid)); } diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh index 4a053ea53b3..f7211b724a0 100644 --- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh @@ -87,8 +87,8 @@ public: return; } - exec_place grid = memory_node.get_grid(); - size_t total_size = this->shape.size(); + exec_place grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) // * index = x + nx*y + nx*ny*z + nx*ny*nz*t diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh index f9acdc65fcc..9df09e99430 100644 --- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh @@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi } else { - loop_dispatch( - mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); + loop_dispatch(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); } } #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index b9d7771573a..23ed4384f5f 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -75,12 +75,8 @@ public: // ::std::function delinearize : translate the index in a buffer into a position in the data // TODO pass mv(place) template - localized_array(exec_place grid, - get_executor_func_t 
mapper, - F&& delinearize, - size_t total_size, - size_t elemsize, - dim4 data_dims) + localized_array( + exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 data_dims) : grid(mv(grid)) , mapper(mv(mapper)) , total_size_bytes(total_size * elemsize) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index f5a45ba892f..903d15481cf 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -1242,8 +1242,8 @@ public: ::std::string to_string() const override { - return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) - + "x" + ::std::to_string(dims_.t) + ")"; + return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + "x" + + ::std::to_string(dims_.t) + ")"; } // ===== Comparison ===== diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index 5a01d242608..3d71c5c6993 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -94,8 +94,8 @@ public: return; } - exec_place grid = memory_node.get_grid(); - size_t total_size = this->shape.size(); + exec_place grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) // * index = x + nx*y + nx*ny*z + nx*ny*nz*t