diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index f57ad22db37..7441e508916 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p) { return data_place::invalid(); } - exec_place_grid* grid_ptr = static_cast(grid_handle); + exec_place* grid_ptr = static_cast(grid_handle); // Layout-compatible: pass C mapper directly so the runtime calls it get_executor_func_t cpp_mapper = reinterpret_cast(mapper); return data_place::composite(cpp_mapper, *grid_ptr); @@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id { places.push_back(exec_place::device(device_ids[i])); } - exec_place_grid grid = make_grid(::std::move(places)); - return new exec_place_grid(::std::move(grid)); + exec_place grid = make_grid(::std::move(places)); + return new exec_place(::std::move(grid)); } stf_exec_place_grid_handle @@ -439,18 +439,17 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf { cpp_places.push_back(to_exec_place(const_cast(&places[i]))); } - exec_place_grid grid = - (grid_dims != nullptr) - ? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t)) - : make_grid(::std::move(cpp_places)); - return new exec_place_grid(::std::move(grid)); + exec_place grid = (grid_dims != nullptr) + ? 
make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t)) + : make_grid(::std::move(cpp_places)); + return new exec_place(::std::move(grid)); } void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid) { if (grid != nullptr) { - delete static_cast(grid); + delete static_cast(grid); } } diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 39375f6db2d..3251b32eded 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -456,17 +456,17 @@ public: void set_current_place(pos4 p) { - get_exec_place().as_grid().set_current_place(p); + get_exec_place().set_current_place(p); } void unset_current_place() { - get_exec_place().as_grid().unset_current_place(); + get_exec_place().unset_current_place(); } - const exec_place& get_current_place() const + exec_place get_current_place() const { - return get_exec_place().as_grid().get_current_place(); + return get_exec_place().get_current_place(); } private: diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh index 6aa1b142e9f..f7211b724a0 100644 --- a/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/slice.cuh @@ -87,8 +87,8 @@ public: return; } - exec_place_grid grid = memory_node.get_grid(); - size_t total_size = this->shape.size(); + exec_place grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) // * index = x + nx*y + nx*ny*z + nx*ny*nz*t diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index b3659b7c829..3ef18a36803 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ 
b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -1117,9 +1117,6 @@ public: } } - template - auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete; - template auto parallel_for(S shape, Deps... deps) { diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index b33b14929c7..235219943db 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar template void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank) { - assert(!p.is_grid()); + assert(p.size() == 1); p->*[&] { auto th = thread_hierarchy(static_cast(rank), interpreted_policy); @@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg template void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank) { - assert(!p.is_grid()); + assert(p.size() == 1); auto kernel_args = tuple_prepend(thread_hierarchy(static_cast(rank), interpreted_policy), mv(arg)); using args_type = decltype(kernel_args); @@ -331,11 +331,11 @@ public: assert(e_place.affine_data_place() == t.get_affine_data_place()); /* - * If we have a grid of places, the implicit affine partitioner is the blocked_partition. + * If we have a multi-element grid of places, the implicit affine partitioner is the blocked_partition. * * An explicit composite data place is required per data dependency to customize this behaviour. 
*/ - if (e_place.is_grid()) + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid())); diff --git a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh index 3359a8ea03c..9df09e99430 100644 --- a/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/loop_dispatch.cuh @@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi } else { - loop_dispatch( - mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); + loop_dispatch(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func)); } } #endif // _CCCL_DOXYGEN_INVOKED diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index ccc420f6609..53da4f55f9f 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -550,8 +550,8 @@ public: // If there is a partitioner, we ensure there is a proper affine data place for this execution place if constexpr (!::std::is_same_v) { - // This is only meaningful for grid of places - if (e_place.is_grid()) + // Grids need a composite data place + if (e_place.size() > 1) { // Create a composite data place defined by the grid of places + the partitioning function t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid())); @@ -629,7 +629,7 @@ public: if constexpr (need_reduction) { _CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host."); - _CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places."); + _CCCL_ASSERT(e_place.size() == 1, 
"Reduce access mode currently unimplemented on grid of places."); do_parallel_for_redux(f, e_place, shape, t); return; } @@ -659,17 +659,18 @@ public: if constexpr (!::std::is_same_v && is_extended_host_device_lambda_closure_type || is_extended_device_lambda_closure_type) { - if (!e_place.is_grid()) + if (e_place.size() == 1) { // Apply the parallel_for construct over the entire shape on the - // execution place of the task + // execution place of the task. + const exec_place& scalar_place = e_place; if constexpr (need_reduction) { - do_parallel_for_redux(f, e_place, shape, t); + do_parallel_for_redux(f, scalar_place, shape, t); } else { - do_parallel_for(f, e_place, shape, t); + do_parallel_for(f, scalar_place, shape, t); } } else @@ -681,11 +682,12 @@ public: } else { - size_t grid_size = t.grid_dims().size(); + const auto& t_place = t.get_exec_place(); + size_t grid_size = t_place.size(); for (size_t i = 0; i < grid_size; i++) { t.set_current_place(pos4(i)); - const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims()); + const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims()); do_parallel_for(f, t.get_current_place(), sub_shape, t); t.unset_current_place(); } diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index c4298f35459..23ed4384f5f 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -75,12 +75,8 @@ public: // ::std::function delinearize : translate the index in a buffer into a position in the data // TODO pass mv(place) template - localized_array(exec_place_grid grid, - get_executor_func_t mapper, - F&& delinearize, - size_t total_size, - size_t elemsize, - dim4 data_dims) + localized_array( + exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 
data_dims) : grid(mv(grid)) , mapper(mv(mapper)) , total_size_bytes(total_size * elemsize) @@ -422,7 +418,7 @@ private: } event_list prereqs; // To allow reuse in a cache - exec_place_grid grid; + exec_place grid; get_executor_func_t mapper = nullptr; ::std::vector meta; diff --git a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh index ecf6d8f7b45..14b20f4a707 100644 --- a/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/places/data_place_interface.cuh @@ -46,7 +46,6 @@ namespace cuda::experimental::stf { // Forward declarations class exec_place; -class exec_place_grid; class pos4; class dim4; @@ -184,7 +183,7 @@ public: * @brief Get the grid for composite places * @throws std::logic_error if not a composite place */ - virtual const exec_place_grid& get_grid() const + virtual const exec_place& get_grid() const { throw ::std::logic_error("get_grid() called on non-composite data_place"); } diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh index a18545e4014..a490f57054f 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh @@ -30,92 +30,82 @@ namespace cuda::experimental::stf { /** - * @brief Designates execution that is to run on a specific CUDA stream - * + * @brief Implementation for CUDA stream execution places */ -class exec_place_cuda_stream : public exec_place +class exec_place_cuda_stream_impl : public exec_place::impl { public: - class impl : public exec_place::impl + exec_place_cuda_stream_impl(const decorated_stream& dstream) + : exec_place::impl(data_place::device(dstream.dev_id)) + , dstream_(dstream) + , dummy_pool_(dstream) + {} + + exec_place get_place(size_t idx) const override { - public: - impl(const 
decorated_stream& _dstream) - : exec_place::impl(data_place::device(_dstream.dev_id)) - , dstream(_dstream) - , dummy_pool(_dstream) - {} - - /* We set the current device to be the device on which the CUDA stream was created */ - exec_place activate() const override - { - return exec_place::device(dstream.dev_id).activate(); - } + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::cuda_stream(dstream_); + } - void deactivate(const exec_place& prev) const override - { - return exec_place::device(dstream.dev_id).deactivate(prev); - } + exec_place activate(size_t idx) const override + { + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + return exec_place::device(dstream_.dev_id).activate(); + } - stream_pool& get_stream_pool(bool) const override - { - return dummy_pool; - } + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place"); + exec_place::device(dstream_.dev_id).deactivate(prev); + } - ::std::string to_string() const override - { - return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")"; - } + stream_pool& get_stream_pool(bool) const override + { + return dummy_pool_; + } - bool operator==(const exec_place::impl& rhs) const override + ::std::string to_string() const override + { + return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")"; + } + + int cmp(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return false; - } - const auto& other = static_cast(rhs); - // Compare by stream handle - return dstream.stream == other.dstream.stream; + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; } - - size_t hash() const override + const auto& other = static_cast(rhs); + if (dstream_.stream < other.dstream_.stream) { - // Hash the stream handle, not the affine data place - return ::std::hash()(dstream.stream); + return -1; } - - bool operator<(const exec_place::impl& rhs) const override + if (other.dstream_.stream < dstream_.stream) { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return dstream.stream < other.dstream.stream; + return 1; } + return 0; + } - private: - decorated_stream dstream; - // We create a dummy pool of streams which only consists in a single stream in practice. - mutable stream_pool dummy_pool; - }; - -public: - exec_place_cuda_stream(const decorated_stream& dstream) - : exec_place(::std::make_shared(dstream)) + size_t hash() const override { - static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place), - "exec_place_cuda_stream cannot add state; it would be sliced away."); + return ::std::hash()(dstream_.stream); } + +private: + decorated_stream dstream_; + mutable stream_pool dummy_pool_; }; -inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream) +inline exec_place exec_place::cuda_stream(cudaStream_t stream) { int devid = get_device_from_stream(stream); - return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid)); + return exec_place( + ::std::make_shared(decorated_stream(stream, get_stream_id(stream), devid))); } -inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream) +inline exec_place exec_place::cuda_stream(const decorated_stream& dstream) { - return exec_place_cuda_stream(dstream); + return exec_place(::std::make_shared(dstream)); } } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index 
f5e5531eb52..284cb3f134e 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -252,135 +252,112 @@ private: }; /** - * @brief Designates execution that is to run on a green context. Initialize with the device ordinal and green_context + * @brief Implementation for green context execution places */ -class exec_place_green_ctx : public exec_place +class exec_place_green_ctx_impl : public exec_place::impl { public: - class impl : public exec_place::impl + /** + * @brief Construct a green context execution place + * + * @param gc_view The green context view + * @param use_green_ctx_data_place If true, use a green context data place as the + * affine data place. If false (default), use a regular device data place instead. + */ + exec_place_green_ctx_impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false) + : exec_place::impl( + use_green_ctx_data_place ? make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid)) + , devid_(gc_view.devid) + , g_ctx_(gc_view.g_ctx) + , pool_(mv(gc_view.pool)) + {} + + // This is used to implement deactivate and wrap an existing context + exec_place_green_ctx_impl(CUcontext saved_context) + : driver_context_(saved_context) + {} + + exec_place get_place(size_t idx) const override { - public: - /** - * @brief Construct a green context execution place - * - * @param gc_view The green context view - * @param use_green_ctx_data_place If true, use a green context data place as the - * affine data place. If false (default), use a regular device data place instead. - */ - impl(green_ctx_view gc_view, bool use_green_ctx_data_place = false) - : exec_place::impl( - use_green_ctx_data_place ? 
make_green_ctx_data_place(gc_view) : data_place::device(gc_view.devid)) - , devid(gc_view.devid) - , g_ctx(gc_view.g_ctx) - , pool(mv(gc_view.pool)) - {} - - // This is used to implement deactivate and wrap an existing context - impl(CUcontext saved_context) - : driver_context(saved_context) - {} - - exec_place activate() const override - { - // Save the current context and transform it into a fake green context place - CUcontext current_ctx; - cuda_safe_call(cuCtxGetCurrent(&current_ctx)); - exec_place result = exec_place(::std::make_shared(current_ctx)); - - // Convert the green context to a primary context (TODO cache this ?) - cuda_safe_call(cuCtxFromGreenCtx(&driver_context, g_ctx)); - -# if 0 - // for debug purposes, display the affinity - { - CUdevResource check_resource; - cuda_safe_call(cuGreenCtxGetDevResource(g_ctx, &check_resource, CU_DEV_RESOURCE_TYPE_SM)); - unsigned long long check_ctxId; - cuda_safe_call(cuCtxGetId(driver_context, &check_ctxId)); - fprintf(stderr, "ACTIVATE : set affinity with %d SMs (ctx ID = %llu)\n", check_resource.sm.smCount, - check_ctxId); - } -# endif + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + return exec_place::green_ctx(green_ctx_view(g_ctx_, pool_, devid_)); + } - cuda_safe_call(cuCtxSetCurrent(driver_context)); + exec_place activate(size_t idx) const override + { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); - return result; - } + // Save the current context and transform it into a fake green context place + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(&current_ctx)); + exec_place result = exec_place(::std::make_shared(current_ctx)); - void deactivate(const exec_place& prev) const override - { - auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); - CUcontext saved_ctx = prev_impl->driver_context; + // Convert the green context to a primary context + cuda_safe_call(cuCtxFromGreenCtx(&driver_context_, g_ctx_)); + cuda_safe_call(cuCtxSetCurrent(driver_context_)); + + 
return result; + } + + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx == 0, "Index out of bounds for green_ctx exec_place"); + + auto prev_impl = ::std::static_pointer_cast(prev.get_impl()); + CUcontext saved_ctx = prev_impl->driver_context_; # ifdef DEBUG - // Ensure that the current context is the green context that we have activated before - CUcontext current_ctx; - cuda_safe_call(cuCtxGetCurrent(&current_ctx)); - assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context)); + CUcontext current_ctx; + cuda_safe_call(cuCtxGetCurrent(&current_ctx)); + assert(get_cuda_context_id(current_ctx) == get_cuda_context_id(driver_context_)); # endif - cuda_safe_call(cuCtxSetCurrent(saved_ctx)); - } + cuda_safe_call(cuCtxSetCurrent(saved_ctx)); + } - ::std::string to_string() const override - { - return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid) - + ")"; - } + ::std::string to_string() const override + { + return "green_ctx(id=" + ::std::to_string(get_cuda_context_id(g_ctx_)) + " dev=" + ::std::to_string(devid_) + ")"; + } - stream_pool& get_stream_pool(bool) const override - { - return pool; - } + stream_pool& get_stream_pool(bool) const override + { + return pool_; + } - bool operator==(const exec_place::impl& rhs) const override + int cmp(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - if (typeid(*this) != typeid(rhs)) - { - return false; - } - const auto& other = static_cast(rhs); - // Compare green context handles - return g_ctx == other.g_ctx; + return typeid(*this).before(typeid(rhs)) ? 
-1 : 1; } - - size_t hash() const override + const auto& other = static_cast(rhs); + if (g_ctx_ < other.g_ctx_) { - // Hash the green context handle, not the affine data place - return ::std::hash()(g_ctx); + return -1; } - - bool operator<(const exec_place::impl& rhs) const override + if (other.g_ctx_ < g_ctx_) { - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - const auto& other = static_cast(rhs); - return g_ctx < other.g_ctx; + return 1; } + return 0; + } - private: - int devid = -1; - CUgreenCtx g_ctx = {}; - // a context created from the green context (or used to store an existing context to implement - // activate/deactivate) - mutable CUcontext driver_context = {}; - mutable stream_pool pool; - }; - -public: - exec_place_green_ctx(green_ctx_view gc_view, bool use_green_ctx_data_place = false) - : exec_place(::std::make_shared(mv(gc_view), use_green_ctx_data_place)) + size_t hash() const override { - static_assert(sizeof(exec_place_green_ctx) <= sizeof(exec_place), - "exec_place_green_ctx cannot add state; it would be sliced away."); + return ::std::hash()(g_ctx_); } + +private: + int devid_ = -1; + CUgreenCtx g_ctx_ = {}; + mutable CUcontext driver_context_ = {}; + mutable stream_pool pool_; }; inline exec_place exec_place::green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place) { - return exec_place_green_ctx(gc_view, use_green_ctx_data_place); + return exec_place(::std::make_shared(gc_view, use_green_ctx_data_place)); } inline ::std::shared_ptr green_ctx_data_place_impl::get_affine_exec_impl() const diff --git a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh index d7cd70dca54..df67b92396b 100644 --- a/cudax/include/cuda/experimental/__stf/places/place_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/place_partition.cuh @@ -80,7 +80,7 @@ inline ::std::string 
place_partition_scope_to_string(place_partition_scope scope * `cuda_device` scope. Green context scope requires CUDA 12.4 or later. * * Iteration over subplaces is provided via `begin()` / `end()`; `as_grid()` builds - * an `exec_place_grid` from the subplaces. + * an `exec_place` grid from the subplaces. */ class place_partition { @@ -131,14 +131,13 @@ public: * @param grid Input execution place grid to partition * @param scope Partitioning granularity */ - place_partition(async_resources_handle& handle, const exec_place_grid& grid, place_partition_scope scope) + place_partition(async_resources_handle& handle, const exec_place& grid, place_partition_scope scope) { ::std::vector<::std::shared_ptr> places; - const auto& grid_places = grid.get_places(); - places.reserve(grid_places.size()); - for (const auto& ep : grid_places) + places.reserve(grid.size()); + for (size_t i = 0; i < grid.size(); ++i) { - places.push_back(::std::make_shared(ep)); + places.push_back(::std::make_shared(grid.get_place(i))); } for (const auto& place : places) { @@ -210,10 +209,10 @@ public: return sub_places[i]; } - /** @brief Build an exec_place_grid from the subplaces. - * @return A grid view of the partitioned execution places. + /** @brief Build an exec_place from the subplaces. + * @return A grid view of the partitioned execution places, or single place if size == 1. 
*/ - exec_place_grid as_grid() const + exec_place as_grid() const { return make_grid(sub_places); } @@ -222,9 +221,9 @@ private: /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */ void compute_subplaces(async_resources_handle& handle, const exec_place& place, place_partition_scope scope) { - if (place.is_grid() && scope == place_partition_scope::cuda_stream) + // Handle multi-element grids by recursively partitioning + if (place.size() > 1 && scope == place_partition_scope::cuda_stream) { - // Recursively partition grid into devices, then into streams for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device)) { auto device_p_places = place_partition(device_p, handle, place_partition_scope::cuda_stream).sub_places; @@ -233,6 +232,27 @@ private: return; } + // Handle scalar places (including 1-element grids) for cuda_stream scope + if (place.size() == 1 && scope == place_partition_scope::cuda_stream) + { + // Get the underlying scalar place (for 1-element grids, get the single element) + exec_place scalar_place = place.is_device() ? 
place : place.get_place(0); + if (!scalar_place.is_device()) + { + // Host or other non-device place - no streams to partition into + sub_places.push_back(place); + return; + } + auto& pool = scalar_place.get_stream_pool(true); + for (size_t i = 0; i < pool.size(); i++) + { + decorated_stream dstream = pool.next(scalar_place); + sub_places.push_back(exec_place::cuda_stream(dstream)); + } + return; + } + + // Legacy path for explicit device check (kept for compatibility) if (place.is_device() && scope == place_partition_scope::cuda_stream) { auto& pool = place.get_stream_pool(true); @@ -247,7 +267,7 @@ private: // Green contexts are only supported since CUDA 12.4 #if _CCCL_CTK_AT_LEAST(12, 4) - if (place.is_grid() && scope == place_partition_scope::green_context) + if (place.size() > 1 && scope == place_partition_scope::green_context) { // Recursively partition grid into devices, then into green contexts for (auto& device_p : place_partition(place, handle, place_partition_scope::cuda_device)) @@ -258,18 +278,40 @@ private: return; } + // Handle scalar places (including 1-element grids) for green_context scope + if (place.size() == 1 && scope == place_partition_scope::green_context) + { + exec_place scalar_place = place.is_device() ? place : place.get_place(0); + if (!scalar_place.is_device()) + { + sub_places.push_back(place); + return; + } + int dev_id = device_ordinal(scalar_place.affine_data_place()); + + const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE"); + int sm_cnt = env ? 
atoi(env) : 8; + + auto h = handle.get_gc_helper(dev_id, sm_cnt); + + size_t cnt = h->get_count(); + for (size_t i = 0; i < cnt; i++) + { + sub_places.push_back(exec_place::green_ctx(h->get_view(i))); + } + return; + } + + // Legacy path for explicit device check (kept for compatibility) if (place.is_device() && scope == place_partition_scope::green_context) { - // Find the device associated to the place, and get the green context helper int dev_id = device_ordinal(place.affine_data_place()); - // 8 SMs per green context is a granularity that should work on any arch. const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE"); int sm_cnt = env ? atoi(env) : 8; auto h = handle.get_gc_helper(dev_id, sm_cnt); - // Get views of green context out of the helper to create execution places size_t cnt = h->get_count(); for (size_t i = 0; i < cnt; i++) { @@ -291,17 +333,26 @@ private: #endif // _CCCL_CTK_BELOW(12, 4) _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle."); - if (place.is_grid() && scope == place_partition_scope::cuda_device) - { - exec_place_grid g = place.as_grid(); - // Copy the vector of places - sub_places = g.get_places(); - return; - } - - if (place.is_device() && scope == place_partition_scope::cuda_device) + if (scope == place_partition_scope::cuda_device) { - sub_places.push_back(place); + if (place.size() > 1) + { + // Multi-element grid: extract all places + for (size_t i = 0; i < place.size(); ++i) + { + sub_places.push_back(place.get_place(i)); + } + } + else if (place.is_device()) + { + // Scalar device place + sub_places.push_back(place); + } + else + { + // 1-element grid or other scalar place: extract the underlying place + sub_places.push_back(place.get_place(0)); + } return; } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 8b8a75c23cc..903d15481cf 100644 --- 
a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -50,13 +50,8 @@ namespace cuda::experimental::stf { class exec_place; class exec_place_host; -class exec_place_grid; -class exec_place_cuda_stream; // Green contexts are only supported since CUDA 12.4 -#if _CCCL_CTK_AT_LEAST(12, 4) -class exec_place_green_ctx; -#endif // _CCCL_CTK_AT_LEAST(12, 4) //! Function type for computing executor placement from data coordinates using get_executor_func_t = pos4 (*)(pos4, dim4, dim4); @@ -169,9 +164,9 @@ public: // User-visible API when using a different partitioner than the one of the grid template - static data_place composite(partitioner_t p, const exec_place_grid& g); + static data_place composite(partitioner_t p, const exec_place& g); - static data_place composite(get_executor_func_t f, const exec_place_grid& grid); + static data_place composite(get_executor_func_t f, const exec_place& grid); #if _CCCL_CTK_AT_LEAST(12, 4) static data_place green_ctx(const green_ctx_view& gc_view); @@ -300,7 +295,7 @@ public: return p.pimpl_->get_device_ordinal(); } - const exec_place_grid& get_grid() const + const exec_place& get_grid() const { return pimpl_->get_grid(); } @@ -377,18 +372,19 @@ inline data_place from_index(size_t n); /** * @brief Indicates where a computation takes place (CPU, dev0, dev1, ...) * - * Currently data and computation are together `(devid == int(data_place))`. + * All execution places are modeled as grids. Scalar places (host, single device) + * are simply 1-element grids. This unified model eliminates special-casing and + * allows uniform iteration over any exec_place. */ class exec_place { public: /* - * @brief Using the pimpl idiom. Public because a number of classes inehrit from this. + * @brief Using the pimpl idiom. Public because a number of classes inherit from this. 
*/ class impl { public: - // Note that the default ctor assumes an invalid affine data place impl() = default; impl(const impl&) = delete; impl& operator=(const impl&) = delete; @@ -398,8 +394,44 @@ public: : affine(mv(place)) {} - virtual exec_place activate() const + // ===== Grid interface (all places are grids) ===== + + /** + * @brief Get the dimensions of this grid + * + * For scalar places, returns dim4(1, 1, 1, 1). + */ + virtual dim4 get_dims() const + { + return dim4(1, 1, 1, 1); + } + + /** + * @brief Get the total number of places in this grid + */ + virtual size_t size() const { + return 1; + } + + /** + * @brief Get the sub-place at the given linear index + * + * For scalar places, idx must be 0. + */ + virtual exec_place get_place(size_t idx) const; + + // ===== Activation/deactivation (indexed) ===== + + /** + * @brief Activate the sub-place at the given index + * + * For scalar places, idx must be 0. + * Returns the previous execution state needed for deactivate(). + */ + virtual exec_place activate(size_t idx) const + { + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); if (!affine.is_device()) { return exec_place(); @@ -410,12 +442,15 @@ public: { cuda_safe_call(cudaSetDevice(new_dev_id)); } - auto old_dev = data_place::device(old_dev_id); - return exec_place(mv(old_dev)); + return exec_place(data_place::device(old_dev_id)); } - virtual void deactivate(const exec_place& prev) const + /** + * @brief Deactivate the sub-place at the given index, restoring previous state + */ + virtual void deactivate(size_t idx, const exec_place& prev) const { + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); if (affine.is_device()) { auto current_dev_id = cuda_try(); @@ -427,7 +462,9 @@ public: } } - virtual const data_place affine_data_place() const + // ===== Properties ===== + + virtual data_place affine_data_place() const { return affine; } @@ -437,34 +474,32 @@ public: return "exec(" + affine.to_string() + ")"; } - virtual bool 
is_host() const - { - return affine.is_host(); - } - - virtual bool is_device() const - { - return affine.is_device(); - } - - virtual bool is_grid() const - { - return false; - } - - virtual size_t size() const - { - return 1; - } - virtual void set_affine_data_place(data_place place) { affine = mv(place); } - virtual bool operator==(const impl& rhs) const + // ===== Comparison ===== + + /** + * @brief Three-way comparison + * @return -1 if *this < rhs, 0 if *this == rhs, 1 if *this > rhs + */ + virtual int cmp(const impl& rhs) const { - return affine == rhs.affine; + if (typeid(*this) != typeid(rhs)) + { + return typeid(*this).before(typeid(rhs)) ? -1 : 1; + } + if (affine < rhs.affine) + { + return -1; + } + if (rhs.affine < affine) + { + return 1; + } + return 0; } virtual size_t hash() const @@ -472,24 +507,8 @@ public: return affine.hash(); } - virtual bool operator<(const impl& rhs) const - { - // Different types: order by typeid - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - // Same type (both base impl): compare by device ID - // (base impl stores devid in affine, so we extract it via device_ordinal) - return device_ordinal(affine) < device_ordinal(rhs.affine); - } + // ===== Stream management ===== - /** - * @brief Get the stream pool for this execution place. - * - * The base implementation returns pool_compute or pool_data stored - * directly on the impl. - */ virtual stream_pool& get_stream_pool(bool for_computation) const { return for_computation ? 
pool_compute : pool_data; @@ -498,6 +517,24 @@ public: static constexpr size_t pool_size = 4; static constexpr size_t data_pool_size = 4; + // Grid iteration state - only meaningful for multi-element grids + virtual ::std::ptrdiff_t get_current_idx() const + { + return -1; + } + virtual void set_current_idx(::std::ptrdiff_t) const + { + _CCCL_ASSERT(false, "set_current_idx called on non-grid exec_place"); + } + virtual ::std::shared_ptr get_saved_prev_impl() const + { + return nullptr; + } + virtual void set_saved_prev_impl(::std::shared_ptr) const + { + _CCCL_ASSERT(false, "set_saved_prev_impl called on non-grid exec_place"); + } + protected: friend class exec_place; data_place affine = data_place::invalid(); @@ -515,17 +552,21 @@ public: bool operator==(const exec_place& rhs) const { - return *pimpl == *rhs.pimpl; + if (pimpl.get() == rhs.pimpl.get()) + { + return true; + } + return pimpl->cmp(*rhs.pimpl) == 0; } + bool operator!=(const exec_place& rhs) const { return !(*this == rhs); } - // To use in a ::std::map indexed by exec_place bool operator<(const exec_place& rhs) const { - return *pimpl < *rhs.pimpl; + return pimpl->cmp(*rhs.pimpl) < 0; } bool operator>(const exec_place& rhs) const @@ -543,20 +584,51 @@ public: return !(*this < rhs); } + size_t hash() const + { + return pimpl->hash(); + } + + // ===== Grid interface (all places are grids) ===== + /** - * @brief Compute a hash value for this execution place + * @brief Get the dimensions of this grid * - * Used by std::hash specialization for unordered containers. + * For scalar places (host, single device), returns dim4(1, 1, 1, 1). */ - size_t hash() const + dim4 get_dims() const { - return pimpl->hash(); + return pimpl->get_dims(); } /** - * @brief an iterator class which goes over all subplaces in an exec place. 
+ * @brief Get the total number of places in this grid + */ + size_t size() const + { + return pimpl->size(); + } + + /** + * @brief Get the sub-place at the given linear index * - * This is a trivial singleton unless we have a grid of places. + * For scalar places, idx must be 0 and returns the place itself. + */ + exec_place get_place(size_t idx) const + { + return pimpl->get_place(idx); + } + + /** + * @brief Get the sub-place at the given multi-dimensional position + */ + exec_place get_place(pos4 p) const + { + return get_place(get_dims().get_index(p)); + } + + /** + * @brief an iterator class which goes over all subplaces in an exec place. */ class iterator { @@ -566,7 +638,10 @@ public: , index(index) {} - exec_place operator*(); + exec_place operator*() + { + return it_impl->get_place(index); + } iterator& operator++() { @@ -598,98 +673,165 @@ public: return iterator(pimpl, pimpl->size()); } + // ===== Activation/deactivation ===== + /** - * @brief Returns a string representation of the execution place object. + * @brief Activate the sub-place at the given index * - * @return std::string + * @param idx The index of the sub-place to activate (default 0 for scalar places) + * @return The previous execution state needed for deactivate() */ - ::std::string to_string() const + exec_place activate(size_t idx = 0) const { - return pimpl->to_string(); + return pimpl->activate(idx); } /** - * @brief Returns the `data_place` naturally associated with this execution place. 
+ * @brief Deactivate the sub-place at the given index, restoring previous state + * + * @param idx The index of the sub-place to deactivate (default 0 for scalar places) + * @param prev The previous state returned by activate() */ - const data_place affine_data_place() const + void deactivate(size_t idx, const exec_place& prev) const { - return pimpl->affine_data_place(); + pimpl->deactivate(idx, prev); } - void set_affine_data_place(data_place place) + /** + * @brief Convenience overload for scalar places (idx=0) + */ + void deactivate(const exec_place& prev) const { - pimpl->set_affine_data_place(mv(place)); + deactivate(0, prev); } - stream_pool& get_stream_pool(bool for_computation) const + /** + * @brief Set the current place for grid iteration + * + * Activates the place at the given index and saves state for later restoration. + */ + void set_current_place(size_t idx) { - return pimpl->get_stream_pool(for_computation); + auto cur_idx = pimpl->get_current_idx(); + if (cur_idx >= 0) + { + exec_place saved_prev(pimpl->get_saved_prev_impl()); + pimpl->deactivate(cur_idx, saved_prev); + } + pimpl->set_current_idx(static_cast<::std::ptrdiff_t>(idx)); + exec_place prev = pimpl->activate(idx); + pimpl->set_saved_prev_impl(prev.pimpl); } /** - * @brief Get a decorated stream from the stream pool associated to this execution place. + * @brief Set the current place using multi-dimensional position */ - decorated_stream getStream(bool for_computation) const; - - cudaStream_t pick_stream(bool for_computation = true) const + void set_current_place(pos4 p) { - return getStream(for_computation).stream; + set_current_place(get_dims().get_index(p)); } - // TODO make protected ! 
- const ::std::shared_ptr& get_impl() const + /** + * @brief Unset the current place, restoring previous execution context + */ + void unset_current_place() { - return pimpl; + auto cur_idx = pimpl->get_current_idx(); + EXPECT(cur_idx >= 0, "unset_current_place() called without corresponding set_current_place()"); + exec_place saved_prev(pimpl->get_saved_prev_impl()); + pimpl->deactivate(cur_idx, saved_prev); + pimpl->set_current_idx(-1); } /** - * @brief Set computation to run on this place. - * - * @return `exec_place` The previous execution place. See `deactivate` below. + * @brief Get the currently active sub-place */ - exec_place activate() const + exec_place get_current_place() const { - return pimpl->activate(); + auto cur_idx = pimpl->get_current_idx(); + EXPECT(cur_idx >= 0, "No current place set"); + return get_place(cur_idx); } /** - * @brief Undoes the effect of `activate`. Call with the previous `exec_place` object returned by `activate`. - * - * @warning Undefined behavior if you don't pass the result of `activate`. 
+ * @brief Get the index of the currently active sub-place, or -1 if none */ - void deactivate(const exec_place& p) const + ::std::ptrdiff_t current_place_id() const + { + return pimpl->get_current_idx(); + } + + // ===== Properties ===== + + ::std::string to_string() const + { + return pimpl->to_string(); + } + + data_place affine_data_place() const { - pimpl->deactivate(p); + return pimpl->affine_data_place(); + } + + void set_affine_data_place(data_place place) + { + pimpl->set_affine_data_place(mv(place)); + } + + stream_pool& get_stream_pool(bool for_computation) const + { + return pimpl->get_stream_pool(for_computation); + } + + decorated_stream getStream(bool for_computation) const; + + cudaStream_t pick_stream(bool for_computation = true) const + { + return getStream(for_computation).stream; + } + + const ::std::shared_ptr& get_impl() const + { + return pimpl; } bool is_host() const { - return pimpl->is_host(); + return affine_data_place().is_host(); } bool is_device() const { - return pimpl->is_device(); + return affine_data_place().is_device(); } - bool is_grid() const + /** + * @brief Get the dimension along a specific axis + * @deprecated Use get_dims().get(axis_id) instead + */ + size_t grid_dim(int axis_id) const { - return pimpl->is_grid(); + return get_dims().get(axis_id); } - size_t size() const + /** + * @brief Get all dimensions + * @deprecated Use get_dims() instead + */ + dim4 grid_dims() const { - return pimpl->size(); + return get_dims(); } - // Get the implementation assuming this is a grid - // We need to defer the implementation after exec_place_grid has been - // defined because this requires a ::std::static_pointer_cast from the base - // class to exec_place_grid - exec_place_grid as_grid() const; - - size_t grid_dim(int axid_is) const; - dim4 grid_dims() const; + /** + * @brief Returns *this for compatibility + * @deprecated All places are grids now; use exec_place methods directly + */ + const exec_place& as_grid() const + { + 
EXPECT(size() > 1, "as_grid() called on scalar exec_place"); + return *this; + } /* These helper methods provide convenient way to express execution places, * for example exec_place::host or exec_place::device(4). @@ -711,8 +853,8 @@ public: static exec_place green_ctx(const green_ctx_view& gc_view, bool use_green_ctx_data_place = false); #endif // _CCCL_CTK_AT_LEAST(12, 4) - static exec_place_cuda_stream cuda_stream(cudaStream_t stream); - static exec_place_cuda_stream cuda_stream(const decorated_stream& dstream); + static exec_place cuda_stream(cudaStream_t stream); + static exec_place cuda_stream(const decorated_stream& dstream); /** * @brief Returns the currently active device. @@ -724,14 +866,14 @@ public: return exec_place::device(cuda_try()); } - static exec_place_grid all_devices(); + static exec_place all_devices(); - static exec_place_grid n_devices(size_t n, dim4 dims); + static exec_place n_devices(size_t n, dim4 dims); - static exec_place_grid n_devices(size_t n); + static exec_place n_devices(size_t n); // For debug purpose on a machine with a single GPU, for example - static exec_place_grid repeat(const exec_place& e, size_t cnt); + static exec_place repeat(const exec_place& e, size_t cnt); template auto partition_by_scope(Args&&... args); @@ -870,11 +1012,11 @@ inline decorated_stream exec_place::getStream(bool for_computation) const /** * @brief Designates execution that is to run on the host. * + * Host is modeled as a 1-element grid containing the host execution context. */ class exec_place_host : public exec_place { public: - // Implementation of the exec_place_host class class impl : public exec_place::impl { public: @@ -882,21 +1024,27 @@ public: : exec_place::impl(data_place::host()) {} - // operator<: base class implementation is correct (compares typeid, then device_ordinal). - // Since host is a singleton, all instances compare equal. 
+ // Grid interface - host is a 1-element grid + exec_place get_place(size_t idx) const override; - exec_place activate() const override + // Activation - no-op for host + exec_place activate(size_t idx) const override { + EXPECT(idx == 0, "Index out of bounds for host exec_place"); return exec_place(); - } // no-op - void deactivate(const exec_place& p) const override + } + + void deactivate(size_t idx, const exec_place& prev) const override { - _CCCL_ASSERT(!p.get_impl(), ""); - } // no-op - virtual const data_place affine_data_place() const override + EXPECT(idx == 0, "Index out of bounds for host exec_place"); + _CCCL_ASSERT(!prev.get_impl(), "Host deactivate expects empty prev"); + } + + data_place affine_data_place() const override { return data_place::host(); } + stream_pool& get_stream_pool(bool for_computation) const override { return exec_place::current_device().get_stream_pool(for_computation); @@ -943,6 +1091,8 @@ UNITTEST("exec_place_host::operator->*") /** * @brief Designates execution that is to run on a specific CUDA device. + * + * Device is modeled as a 1-element grid containing that device. */ class exec_place_device : public exec_place { @@ -952,10 +1102,22 @@ public: public: explicit impl(int devid) : exec_place::impl(data_place::device(devid)) + , devid_(devid) { pool_compute = stream_pool(pool_size); pool_data = stream_pool(data_pool_size); } + + // Grid interface - device is a 1-element grid + exec_place get_place(size_t idx) const override; + + int get_devid() const + { + return devid_; + } + + private: + int devid_; }; }; @@ -1022,335 +1184,163 @@ UNITTEST("exec_place copyable") }; #endif // UNITTESTED_FILE -//! A multidimensional grid of execution places for structured parallel computation -class exec_place_grid : public exec_place +/** + * Implementation class for multi-device execution place grids. + * This is used internally by make_grid() and related factory functions. 
+ */ +class exec_place_grid_impl : public exec_place::impl { public: - /* - * Implementation of the exec_place_grid - */ - class impl : public exec_place::impl + exec_place_grid_impl(::std::vector _places) + : dims_(_places.size(), 1, 1, 1) + , places_(mv(_places)) { - public: - // Define a grid directly from a vector of places - // This creates an execution grid automatically - impl(::std::vector _places) - : dims(_places.size(), 1, 1, 1) - , places(mv(_places)) - { - _CCCL_ASSERT(!places.empty(), ""); - _CCCL_ASSERT(dims.x > 0, ""); - _CCCL_ASSERT(affine.is_invalid(), ""); - } - - // With a "dim4 shape" - impl(::std::vector _places, const dim4& _dims) - : dims(_dims) - , places(mv(_places)) - { - _CCCL_ASSERT(dims.x > 0, ""); - _CCCL_ASSERT(affine.is_invalid(), ""); - } - - // TODO improve with a better description - ::std::string to_string() const final - { - return ::std::string("GRID place"); - } - - exec_place activate() const override - { - // No-op - return exec_place(); - } - - // TODO : shall we deactivate the current place, if any ? 
- void deactivate(const exec_place& _prev) const override - { - // No-op - EXPECT(!_prev.get_impl(), "Invalid execution place."); - } - - /* Dynamically checks whether an execution place is a device */ - bool is_device() const override - { - return false; - } - - /* Dynamically checks whether an execution place is a grid */ - bool is_grid() const override - { - return true; - } - - bool operator==(const exec_place::impl& rhs) const override - { - // First, check if rhs is of type exec_place_grid::impl - auto other = dynamic_cast(&rhs); - if (!other) - { - return false; // rhs is not a grid, so they are not equal - } - - // Compare two grids - return *this == *other; - } - - // Compare two grids - bool operator==(const impl& rhs) const - { - // Compare grid-specific properties - // Note: for grids, equality is determined by dims and places, not the affine data place - return dims == rhs.dims && places == rhs.places; - } - - size_t hash() const override - { - // Hash based on dims and places, consistent with operator== - size_t h = ::cuda::experimental::stf::hash{}(dims); - for (const auto& p : places) - { - hash_combine(h, p.hash()); - } - return h; - } - - bool operator<(const exec_place::impl& rhs) const override - { - // Different types: order by typeid - if (typeid(*this) != typeid(rhs)) - { - return typeid(*this).before(typeid(rhs)); - } - // Same type: safe to cast - const auto& other = static_cast(rhs); - // Compare dims first, then places - if (!(dims == other.dims)) - { - // Use tuple comparison for consistent ordering - return ::std::tie(dims.x, dims.y, dims.z, dims.t) - < ::std::tie(other.dims.x, other.dims.y, other.dims.z, other.dims.t); - } - return places < other.places; - } - - const ::std::vector& get_places() const - { - return places; - } - - stream_pool& get_stream_pool(bool for_computation) const override - { - _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool"); - const auto& v = get_places(); - _CCCL_ASSERT(v.size() > 0, "Grid 
must have at least one place"); - return v[0].get_stream_pool(for_computation); - } - - exec_place grid_activate(size_t i) const - { - const auto& v = get_places(); - return v[i].activate(); - } + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); + } - void grid_deactivate(size_t i, exec_place p) const - { - const auto& v = get_places(); - v[i].deactivate(p); - } + exec_place_grid_impl(::std::vector _places, const dim4& _dims) + : dims_(_dims) + , places_(mv(_places)) + { + _CCCL_ASSERT(dims_.x > 0, "Grid dimensions must be positive"); + } - const exec_place& get_current_place() - { - return get_places()[current_p_1d]; - } + // ===== Grid interface ===== - // Set the current place from the 1D index within the grid (flattened grid) - void set_current_place(size_t p_index) - { - // Unset the previous place, if any - if (current_p_1d >= 0) - { - // First deactivate the previous place - grid_deactivate(current_p_1d, old_place); - } + dim4 get_dims() const override + { + return dims_; + } - // get the 1D index for that position - current_p_1d = (::std::ptrdiff_t) p_index; + size_t size() const override + { + return dims_.size(); + } - // The returned value contains the state to restore when we deactivate the place - old_place = grid_activate(current_p_1d); - } + exec_place get_place(size_t idx) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + return places_[idx]; + } - // Set the current place, given the position in the grid - void set_current_place(pos4 p) - { - size_t p_index = dims.get_index(p); - set_current_place(p_index); - } + // ===== Activation (delegates to sub-places) ===== - void unset_current_place() - { - EXPECT(current_p_1d >= 0, "unset_current_place() called without corresponding call to set_current_place()"); + exec_place activate(size_t idx) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + return 
places_[idx].activate(0); + } - // First deactivate the previous place - grid_deactivate(current_p_1d, old_place); - current_p_1d = -1; - } + void deactivate(size_t idx, const exec_place& prev) const override + { + EXPECT(idx < places_.size(), "Index out of bounds"); + places_[idx].deactivate(0, prev); + } - ::std::ptrdiff_t current_place_id() const - { - return current_p_1d; - } + // ===== Properties ===== - dim4 get_dims() const - { - return dims; - } + ::std::string to_string() const override + { + return "grid(" + ::std::to_string(dims_.x) + "x" + ::std::to_string(dims_.y) + "x" + ::std::to_string(dims_.z) + "x" + + ::std::to_string(dims_.t) + ")"; + } - size_t get_dim(int axis_id) const - { - return dims.get(axis_id); - } + // ===== Comparison ===== - size_t size() const override + int cmp(const exec_place::impl& rhs) const override + { + if (typeid(*this) != typeid(rhs)) { - return dims.size(); + return typeid(*this).before(typeid(rhs)) ? -1 : 1; } - - /* Get the place associated to this position in the grid */ - const exec_place& get_place(pos4 p) const + const auto& other = static_cast(rhs); + // Compare dims first + auto this_dims = ::std::tie(dims_.x, dims_.y, dims_.z, dims_.t); + auto other_dims = ::std::tie(other.dims_.x, other.dims_.y, other.dims_.z, other.dims_.t); + if (this_dims < other_dims) { - return coords_to_place(p); + return -1; } - - const exec_place& get_place(size_t p_index) const + if (other_dims < this_dims) { - return coords_to_place(p_index); + return 1; } - - private: - // What is the execution place at theses coordinates in the exec place grid ? 
- const exec_place& coords_to_place(size_t c0, size_t c1 = 0, size_t c2 = 0, size_t c3 = 0) const + // Then compare places + if (places_ < other.places_) { - // Flatten the (c0, c1, c2, c3) vector into a global index - size_t index = c0 + dims.get(0) * (c1 + dims.get(1) * (c2 + c3 * dims.get(2))); - return places[index]; + return -1; } - - const exec_place& coords_to_place(pos4 coords) const + if (other.places_ < places_) { - return coords_to_place(coords.x, coords.y, coords.z, coords.t); + return 1; } - - // current position in the grid (flattened to 1D) if we have a grid of - // execution place. -1 indicates there is no current position. - ::std::ptrdiff_t current_p_1d = -1; - - // saved state before setting the current place - exec_place old_place; - - // dimensions of the "grid" - dim4 dims; - ::std::vector places; - }; - - ///@{ @name Constructors - dim4 get_dims() const - { - return get_impl()->get_dims(); + return 0; } - size_t get_dim(int axis_id) const - { - return get_dims().get(axis_id); - } - - size_t size() const - { - return get_dims().size(); - } - - explicit operator bool() const - { - return get_impl() != nullptr; - } - - /* Note that we compare against the exact same implementation : we could - * have equivalent grids with the same execution places, but to avoid a - * costly comparison we here only look for actually identical grids. 
- */ - bool operator==(const exec_place_grid& rhs) const - { - return *get_impl() == *(rhs.get_impl()); - } - - ::std::ptrdiff_t current_place_id() const + size_t hash() const override { - return get_impl()->current_place_id(); + size_t h = ::cuda::experimental::stf::hash{}(dims_); + for (const auto& p : places_) + { + hash_combine(h, p.hash()); + } + return h; } - const exec_place& get_place(pos4 p) const - { - return get_impl()->get_place(p); - } + // ===== Stream management ===== - const ::std::vector& get_places() const + stream_pool& get_stream_pool(bool for_computation) const override { - return get_impl()->get_places(); + _CCCL_ASSERT(!for_computation, "Expected data transfer stream pool"); + _CCCL_ASSERT(!places_.empty(), "Grid must have at least one place"); + return places_[0].get_stream_pool(for_computation); } - // Set the current place from the 1D index within the grid (flattened grid) - void set_current_place(size_t p_index) - { - return get_impl()->set_current_place(p_index); - } + // ===== Grid iteration state ===== - // Get the current execution place - const exec_place& get_current_place() + ::std::ptrdiff_t get_current_idx() const override { - return get_impl()->get_current_place(); + return current_idx_; } - // Set the current place, given the position in the grid - void set_current_place(pos4 p) + void set_current_idx(::std::ptrdiff_t idx) const override { - return get_impl()->set_current_place(p); + current_idx_ = idx; } - void unset_current_place() + ::std::shared_ptr get_saved_prev_impl() const override { - return get_impl()->unset_current_place(); + return saved_prev_impl_; } - ::std::shared_ptr get_impl() const + void set_saved_prev_impl(::std::shared_ptr p) const override { - _CCCL_ASSERT(::std::dynamic_pointer_cast(exec_place::get_impl()), "Invalid exec_place_grid impl"); - return ::std::static_pointer_cast(exec_place::get_impl()); + saved_prev_impl_ = mv(p); } - // Default constructor - exec_place_grid() - : exec_place(nullptr) - {} - - 
// private: - exec_place_grid(::std::shared_ptr p) - : exec_place(mv(p)) - {} - - exec_place_grid(::std::vector p, const dim4& d) - : exec_place(::std::make_shared(mv(p), d)) - {} +private: + dim4 dims_; + ::std::vector places_; + mutable ::std::ptrdiff_t current_idx_ = -1; + mutable ::std::shared_ptr saved_prev_impl_; }; //! Creates a grid of execution places with specified dimensions -inline exec_place_grid make_grid(::std::vector places, const dim4& dims) +//! Returns the single element if size == 1 (no grid wrapper needed) +inline exec_place make_grid(::std::vector places, const dim4& dims) { - return exec_place_grid(mv(places), dims); + _CCCL_ASSERT(!places.empty(), "invalid places"); + if (places.size() == 1) + { + return mv(places[0]); + } + return exec_place(::std::make_shared(mv(places), dims)); } //! Creates a linear grid from a vector of execution places -inline exec_place_grid make_grid(::std::vector places) +//! Returns the single element if size == 1 (no grid wrapper needed) +inline exec_place make_grid(::std::vector places) { _CCCL_ASSERT(!places.empty(), "invalid places"); auto grid_dim = dim4(places.size(), 1, 1, 1); @@ -1358,7 +1348,6 @@ inline exec_place_grid make_grid(::std::vector places) } // === data_place::affine_exec_place implementation === -// Defined here after exec_place_grid is complete inline exec_place data_place::affine_exec_place() const { @@ -1376,7 +1365,6 @@ inline exec_place data_place::affine_exec_place() const if (is_composite()) { // Return the grid of places associated to this composite data place - // exec_place_grid inherits from exec_place, so this works via slicing return get_grid(); } @@ -1398,45 +1386,43 @@ inline exec_place data_place::affine_exec_place() const + ::std::to_string(pimpl_->get_device_ordinal())); } -/// Implementation deferred because we need the definition of exec_place_grid -inline exec_place exec_place::iterator::operator*() -{ - EXPECT(index < it_impl->size()); - if (it_impl->is_grid()) - { - 
return ::std::static_pointer_cast(it_impl)->get_place(index); - } - return exec_place(it_impl); -} +// === Deferred implementations for get_place() === -//! Creates a grid by replicating an execution place multiple times -inline exec_place_grid exec_place::repeat(const exec_place& e, size_t cnt) +inline exec_place exec_place::impl::get_place(size_t idx) const { - return make_grid(::std::vector(cnt, e)); + EXPECT(idx == 0, "Index out of bounds for scalar exec_place"); + // For generic scalar places, we can't easily return self + // This should be overridden by concrete implementations + return exec_place( + ::std::const_pointer_cast(::std::shared_ptr(::std::shared_ptr{}, this))); } -/* Deferred implementation : ::std::static_pointer_cast requires that exec_place_grid is a complete type */ -inline exec_place_grid exec_place::as_grid() const +inline exec_place exec_place_host::impl::get_place(size_t idx) const { - // Make sure it is really a grid - EXPECT(is_grid()); - return exec_place_grid(::std::static_pointer_cast(pimpl)); + EXPECT(idx == 0, "Index out of bounds for host exec_place"); + return exec_place::host(); } -inline dim4 exec_place::grid_dims() const +inline exec_place exec_place_device::impl::get_place(size_t idx) const { - EXPECT(is_grid()); - return ::std::static_pointer_cast(pimpl)->get_dims(); + EXPECT(idx == 0, "Index out of bounds for device exec_place"); + return exec_place::device(devid_); } -inline size_t exec_place::grid_dim(int axis_id) const +//! Creates a grid by replicating an execution place multiple times +//! Returns the original place if cnt == 1 (no grid wrapper needed) +inline exec_place exec_place::repeat(const exec_place& e, size_t cnt) { - EXPECT(is_grid()); - return ::std::static_pointer_cast(pimpl)->get_dim(axis_id); + if (cnt == 1) + { + return e; + } + return make_grid(::std::vector(cnt, e)); } /* Get the first N available devices */ -inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims) +//! 
Returns single device if n == 1 (no grid wrapper needed) +inline exec_place exec_place::n_devices(size_t n, dim4 dims) { const int ndevs = cuda_try(); @@ -1453,21 +1439,23 @@ inline exec_place_grid exec_place::n_devices(size_t n, dim4 dims) } /* Get the first N available devices */ -inline exec_place_grid exec_place::n_devices(size_t n) +//! Returns single device if n == 1 (no grid wrapper needed) +inline exec_place exec_place::n_devices(size_t n) { return n_devices(n, dim4(n, 1, 1, 1)); } -inline exec_place_grid exec_place::all_devices() +//! Returns all available devices, or single device if only one GPU +inline exec_place exec_place::all_devices() { return n_devices(cuda_try()); } //! Creates a cyclic partition of an execution place grid with specified strides -inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 strides, pos4 tile_id) +//! Returns single place if partition contains only one element +inline exec_place partition_cyclic(const exec_place& e_place, dim4 strides, pos4 tile_id) { - const auto& g = e_place.as_grid(); - dim4 g_dims = e_place.get_dims(); + dim4 g_dims = e_place.get_dims(); /* * Example : strides = (3, 2). 
tile 1 id = (1, 0) @@ -1479,15 +1467,10 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str // Dimension K_x of the new grid on axis x : // pos_x + K_x stride_x = dim_x // K_x = (dim_x - pos_x)/stride_x - dim4 size = dim4((g.get_dim(0) - tile_id.x + strides.x - 1) / strides.x, - (g.get_dim(1) - tile_id.y + strides.y - 1) / strides.y, - (g.get_dim(2) - tile_id.z + strides.z - 1) / strides.z, - (g.get_dim(3) - tile_id.t + strides.t - 1) / strides.t); - - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.x, strides.x, tile_id.x); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.y, strides.y, tile_id.y); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.z, strides.z, tile_id.z); - // fprintf(stderr, "G DIM %d STRIDE %d ID %d\n", g_dims.t, strides.t, tile_id.t); + dim4 size = dim4((g_dims.x - tile_id.x + strides.x - 1) / strides.x, + (g_dims.y - tile_id.y + strides.y - 1) / strides.y, + (g_dims.z - tile_id.z + strides.z - 1) / strides.z, + (g_dims.t - tile_id.t + strides.t - 1) / strides.t); ::std::vector places; places.reserve(size.x * size.y * size.z * size.t); @@ -1500,7 +1483,7 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str { for (size_t x = static_cast(tile_id.x); x < g_dims.x; x += strides.x) { - places.push_back(g.get_place(pos4(x, y, z, t))); + places.push_back(e_place.get_place(pos4(x, y, z, t))); } } } @@ -1514,23 +1497,21 @@ inline exec_place_grid partition_cyclic(const exec_place_grid& e_place, dim4 str } //! Creates a tiled partition of an execution place grid with specified tile sizes +//! Returns single place if partition contains only one element //! //! example : //! 
auto sub_g = partition_tile(g, dim4(2,2), dim4(0,1)) -inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_sizes, pos4 tile_id) +inline exec_place partition_tile(const exec_place& e_place, dim4 tile_sizes, pos4 tile_id) { - const auto& g = e_place.as_grid(); + dim4 g_dims = e_place.get_dims(); - // TODO define dim4=dim4 * dim4 dim4 begin_coords( tile_id.x * tile_sizes.x, tile_id.y * tile_sizes.y, tile_id.z * tile_sizes.z, tile_id.t * tile_sizes.t); - // TODO define dim4=MIN(dim4,dim4) - // upper bound coordinate (excluded) - dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g.get_dim(0)), - ::std::min((tile_id.y + 1) * tile_sizes.y, g.get_dim(1)), - ::std::min((tile_id.z + 1) * tile_sizes.z, g.get_dim(2)), - ::std::min((tile_id.t + 1) * tile_sizes.t, g.get_dim(3))); + dim4 end_coords(::std::min((tile_id.x + 1) * tile_sizes.x, g_dims.x), + ::std::min((tile_id.y + 1) * tile_sizes.y, g_dims.y), + ::std::min((tile_id.z + 1) * tile_sizes.z, g_dims.z), + ::std::min((tile_id.t + 1) * tile_sizes.t, g_dims.t)); // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.x, tile_sizes.x, tile_id.x); // fprintf(stderr, "G DIM %d TILE SIZE %d ID %d\n", g_dims.y, tile_sizes.y, tile_id.y); @@ -1559,7 +1540,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_ { for (size_t x = static_cast(begin_coords.x); x < end_coords.x; x++) { - places.push_back(g.get_place(pos4(x, y, z, t))); + places.push_back(e_place.get_place(pos4(x, y, z, t))); } } } @@ -1581,7 +1562,7 @@ inline exec_place_grid partition_tile(const exec_place_grid& e_place, dim4 tile_ class data_place_composite final : public data_place_interface { public: - data_place_composite(exec_place_grid grid, get_executor_func_t partitioner_func) + data_place_composite(exec_place grid, get_executor_func_t partitioner_func) : grid_(mv(grid)) , partitioner_func_(mv(partitioner_func)) {} @@ -1641,7 +1622,7 @@ public: return false; } - const exec_place_grid& 
get_grid() const override + const exec_place& get_grid() const override { return grid_; } @@ -1652,7 +1633,7 @@ public: } private: - exec_place_grid grid_; + exec_place grid_; get_executor_func_t partitioner_func_; }; @@ -1662,14 +1643,14 @@ inline bool data_place::is_composite() const return typeid(ref) == typeid(data_place_composite); } -inline data_place data_place::composite(get_executor_func_t f, const exec_place_grid& grid) +inline data_place data_place::composite(get_executor_func_t f, const exec_place& grid) { return data_place(::std::make_shared(grid, f)); } // User-visible API when the same partitioner as the one of the grid template -data_place data_place::composite(partitioner_t, const exec_place_grid& g) +data_place data_place::composite(partitioner_t, const exec_place& g) { return data_place::composite(&partitioner_t::get_executor, g); } diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index 21ac9da53a6..3d71c5c6993 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -94,8 +94,8 @@ public: return; } - exec_place_grid grid = memory_node.get_grid(); - size_t total_size = this->shape.size(); + exec_place grid = memory_node.get_grid(); + size_t total_size = this->shape.size(); // position (x,y,z,t) on (nx,ny,nz,nt) // * index = x + nx*y + nx*ny*z + nx*ny*nz*t diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 4176e74b01d..5c000862613 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -75,24 +75,23 @@ public: cudaStream_t get_stream() const { const auto& e_place = get_exec_place(); - if (e_place.is_grid()) + if (e_place.size() > 1) { // Even with a grid, when we have a ctx.task 
construct we have not // yet selected/activated a specific place. So we take the main // stream associated to the whole task in that case. - ::std::ptrdiff_t current_place_id = e_place.as_grid().current_place_id(); - return (current_place_id < 0 ? dstream.stream : stream_grid[current_place_id].stream); + ::std::ptrdiff_t current_id = e_place.current_place_id(); + return (current_id < 0 ? dstream.stream : stream_grid[current_id].stream); } return dstream.stream; } - // TODO use a pos4 and check that we have a grid, of the proper dimension cudaStream_t get_stream(size_t pos) const { const auto& e_place = get_exec_place(); - if (e_place.is_grid()) + if (e_place.size() > 1) { return stream_grid[pos].stream; } @@ -116,19 +115,15 @@ public: event_list ready_prereqs = acquire(ctx); /* Select the stream(s) */ - if (e_place.is_grid()) + if (e_place.size() > 1) { // We have currently no way to pass an array of per-place streams _CCCL_ASSERT(automatic_stream, "automatic stream is not enabled"); - // Note: we store grid in a variable to avoid dangling references - // because the compiler does not know we are making a reference to - // a vector that remains valid - const auto& grid = e_place.as_grid(); - const auto& places = grid.get_places(); - for (const exec_place& p : places) + // Get stream for each place in the grid + for (size_t i = 0; i < e_place.size(); ++i) { - stream_grid.push_back(p.getStream(true)); + stream_grid.push_back(e_place.get_place(i).getStream(true)); } EXPECT(stream_grid.size() > 0UL); @@ -187,7 +182,7 @@ public: } // Select one stream to sync with all prereqs - auto& s0 = e_place.is_grid() ? stream_grid[0] : dstream; + auto& s0 = (e_place.size() > 1) ? 
stream_grid[0] : dstream; /* Ensure that stream depend(s) on prereqs */ submitted_events = stream_async_op(ctx, s0, ready_prereqs); @@ -196,8 +191,8 @@ public: submitted_events.set_symbol("Submitted" + get_symbol()); } - /* If this is a grid, all other streams must wait on s0 too */ - if (e_place.is_grid()) + /* If this is a multi-place grid, all other streams must wait on s0 too */ + if (e_place.size() > 1) { insert_dependencies(stream_grid); } @@ -215,17 +210,17 @@ public: void set_current_place(pos4 p) { - get_exec_place().as_grid().set_current_place(p); + get_exec_place().set_current_place(p); } void unset_current_place() { - return get_exec_place().as_grid().unset_current_place(); + get_exec_place().unset_current_place(); } - const exec_place& get_current_place() + exec_place get_current_place() { - return get_exec_place().as_grid().get_current_place(); + return get_exec_place().get_current_place(); } /* End the task, but do not clear its data structures yet */ @@ -236,9 +231,8 @@ public: event_list end_list; const auto& e_place = get_exec_place(); - // Create an event with this stream - if (e_place.is_grid()) + if (e_place.size() > 1) { // s0 depends on all other streams for (size_t i = 1; i < stream_grid.size(); i++) diff --git a/cudax/test/stf/places/recursion.cu b/cudax/test/stf/places/recursion.cu index 3af51e4dc98..de86d7c22bf 100644 --- a/cudax/test/stf/places/recursion.cu +++ b/cudax/test/stf/places/recursion.cu @@ -12,7 +12,7 @@ using namespace cuda::experimental::stf; -void rec_func(exec_place_grid places) +void rec_func(exec_place places) { if (places.size() == 1) {