Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions c/experimental/stf/src/stf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ static data_place to_data_place(stf_data_place* data_p)
{
return data_place::invalid();
}
exec_place_grid* grid_ptr = static_cast<exec_place_grid*>(grid_handle);
exec_place* grid_ptr = static_cast<exec_place*>(grid_handle);
// Layout-compatible: pass C mapper directly so the runtime calls it
get_executor_func_t cpp_mapper = reinterpret_cast<get_executor_func_t>(mapper);
return data_place::composite(cpp_mapper, *grid_ptr);
Expand Down Expand Up @@ -425,8 +425,8 @@ stf_exec_place_grid_handle stf_exec_place_grid_from_devices(const int* device_id
{
places.push_back(exec_place::device(device_ids[i]));
}
exec_place_grid grid = make_grid(::std::move(places));
return new exec_place_grid(::std::move(grid));
exec_place grid = make_grid(::std::move(places));
return new exec_place(::std::move(grid));
}

stf_exec_place_grid_handle
Expand All @@ -439,18 +439,17 @@ stf_exec_place_grid_create(const stf_exec_place* places, size_t count, const stf
{
cpp_places.push_back(to_exec_place(const_cast<stf_exec_place*>(&places[i])));
}
exec_place_grid grid =
(grid_dims != nullptr)
? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
: make_grid(::std::move(cpp_places));
return new exec_place_grid(::std::move(grid));
exec_place grid = (grid_dims != nullptr)
? make_grid(::std::move(cpp_places), dim4(grid_dims->x, grid_dims->y, grid_dims->z, grid_dims->t))
: make_grid(::std::move(cpp_places));
return new exec_place(::std::move(grid));
}

void stf_exec_place_grid_destroy(stf_exec_place_grid_handle grid)
{
if (grid != nullptr)
{
delete static_cast<exec_place_grid*>(grid);
delete static_cast<exec_place*>(grid);
}
}

Expand Down
8 changes: 4 additions & 4 deletions cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -456,17 +456,17 @@ public:

void set_current_place(pos4 p)
{
get_exec_place().as_grid().set_current_place(p);
get_exec_place().set_current_place(p);
}

void unset_current_place()
{
get_exec_place().as_grid().unset_current_place();
get_exec_place().unset_current_place();
}

const exec_place& get_current_place() const
exec_place get_current_place() const
{
return get_exec_place().as_grid().get_current_place();
return get_exec_place().get_current_place();
}

private:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ public:
return;
}

exec_place_grid grid = memory_node.get_grid();
size_t total_size = this->shape.size();
exec_place grid = memory_node.get_grid();
size_t total_size = this->shape.size();

// position (x,y,z,t) on (nx,ny,nz,nt)
// * index = x + nx*y + nx*ny*z + nx*ny*nz*t
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1117,9 +1117,6 @@ public:
}
}

template <typename S, typename... Deps>
auto parallel_for(exec_place_grid e_place, S shape, Deps... deps) = delete;

template <typename S, typename... Deps>
auto parallel_for(S shape, Deps... deps)
{
Expand Down
8 changes: 4 additions & 4 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
template <typename Fun, typename interpreted_spec, typename Arg>
void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, cudaStream_t stream, size_t rank)
{
assert(!p.is_grid());
assert(p.size() == 1);

p->*[&] {
auto th = thread_hierarchy(static_cast<int>(rank), interpreted_policy);
Expand Down Expand Up @@ -140,7 +140,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
template <typename task_t, typename Fun, typename interpreted_spec, typename Arg>
void graph_launch_impl(task_t& t, interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg arg, size_t rank)
{
assert(!p.is_grid());
assert(p.size() == 1);

auto kernel_args = tuple_prepend(thread_hierarchy(static_cast<int>(rank), interpreted_policy), mv(arg));
using args_type = decltype(kernel_args);
Expand Down Expand Up @@ -331,11 +331,11 @@ public:
assert(e_place.affine_data_place() == t.get_affine_data_place());

/*
* If we have a grid of places, the implicit affine partitioner is the blocked_partition.
* If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition.
*
* An explicit composite data place is required per data dependency to customize this behaviour.
*/
if (e_place.is_grid())
if (e_place.size() > 1)
{
// Create a composite data place defined by the grid of places + the partitioning function
t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,7 @@ inline void loop_dispatch(context_t ctx, size_t start, size_t end, ::std::functi
}
else
{
loop_dispatch<context_t, exec_place_grid, use_threads>(
mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
loop_dispatch<context_t, exec_place, use_threads>(mv(ctx), exec_place::all_devices(), scope, start, end, mv(func));
}
}
#endif // _CCCL_DOXYGEN_INVOKED
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -550,8 +550,8 @@ public:
// If there is a partitioner, we ensure there is a proper affine data place for this execution place
if constexpr (!::std::is_same_v<partitioner_t, null_partition>)
{
// This is only meaningful for grid of places
if (e_place.is_grid())
// Grids need a composite data place
if (e_place.size() > 1)
{
// Create a composite data place defined by the grid of places + the partitioning function
t.set_affine_data_place(data_place::composite(partitioner_t(), e_place.as_grid()));
Expand Down Expand Up @@ -629,7 +629,7 @@ public:
if constexpr (need_reduction)
{
_CCCL_ASSERT(e_place != exec_place::host(), "Reduce access mode currently unimplemented on host.");
_CCCL_ASSERT(!e_place.is_grid(), "Reduce access mode currently unimplemented on grid of places.");
_CCCL_ASSERT(e_place.size() == 1, "Reduce access mode currently unimplemented on grid of places.");
do_parallel_for_redux(f, e_place, shape, t);
return;
}
Expand Down Expand Up @@ -659,17 +659,18 @@ public:
if constexpr (!::std::is_same_v<exec_place_t, exec_place_host> && is_extended_host_device_lambda_closure_type
|| is_extended_device_lambda_closure_type)
{
if (!e_place.is_grid())
if (e_place.size() == 1)
{
// Apply the parallel_for construct over the entire shape on the
// execution place of the task
// execution place of the task.
const exec_place& scalar_place = e_place;
if constexpr (need_reduction)
{
do_parallel_for_redux(f, e_place, shape, t);
do_parallel_for_redux(f, scalar_place, shape, t);
}
else
{
do_parallel_for(f, e_place, shape, t);
do_parallel_for(f, scalar_place, shape, t);
}
}
else
Expand All @@ -681,11 +682,12 @@ public:
}
else
{
size_t grid_size = t.grid_dims().size();
const auto& t_place = t.get_exec_place();
size_t grid_size = t_place.size();
for (size_t i = 0; i < grid_size; i++)
{
t.set_current_place(pos4(i));
const auto sub_shape = partitioner_t::apply(shape, pos4(i), t.grid_dims());
const auto sub_shape = partitioner_t::apply(shape, pos4(i), t_place.get_dims());
do_parallel_for(f, t.get_current_place(), sub_shape, t);
t.unset_current_place();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,8 @@ public:
// ::std::function<pos4(size_t)> delinearize : translate the index in a buffer into a position in the data
// TODO pass mv(place)
template <typename F>
localized_array(exec_place_grid grid,
get_executor_func_t mapper,
F&& delinearize,
size_t total_size,
size_t elemsize,
dim4 data_dims)
localized_array(
exec_place grid, get_executor_func_t mapper, F&& delinearize, size_t total_size, size_t elemsize, dim4 data_dims)
: grid(mv(grid))
, mapper(mv(mapper))
, total_size_bytes(total_size * elemsize)
Expand Down Expand Up @@ -422,7 +418,7 @@ private:
}

event_list prereqs; // To allow reuse in a cache
exec_place_grid grid;
exec_place grid;
get_executor_func_t mapper = nullptr;
::std::vector<metadata> meta;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ namespace cuda::experimental::stf
{
// Forward declarations
class exec_place;
class exec_place_grid;
class pos4;
class dim4;

Expand Down Expand Up @@ -184,7 +183,7 @@ public:
* @brief Get the grid for composite places
* @throws std::logic_error if not a composite place
*/
virtual const exec_place_grid& get_grid() const
virtual const exec_place& get_grid() const
{
throw ::std::logic_error("get_grid() called on non-composite data_place");
}
Expand Down
116 changes: 53 additions & 63 deletions cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,92 +30,82 @@
namespace cuda::experimental::stf
{
/**
* @brief Designates execution that is to run on a specific CUDA stream
*
* @brief Implementation for CUDA stream execution places
*/
class exec_place_cuda_stream : public exec_place
class exec_place_cuda_stream_impl : public exec_place::impl
{
public:
class impl : public exec_place::impl
exec_place_cuda_stream_impl(const decorated_stream& dstream)
: exec_place::impl(data_place::device(dstream.dev_id))
, dstream_(dstream)
, dummy_pool_(dstream)
{}

exec_place get_place(size_t idx) const override
{
public:
impl(const decorated_stream& _dstream)
: exec_place::impl(data_place::device(_dstream.dev_id))
, dstream(_dstream)
, dummy_pool(_dstream)
{}

/* We set the current device to be the device on which the CUDA stream was created */
exec_place activate() const override
{
return exec_place::device(dstream.dev_id).activate();
}
EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
return exec_place::cuda_stream(dstream_);
}

void deactivate(const exec_place& prev) const override
{
return exec_place::device(dstream.dev_id).deactivate(prev);
}
exec_place activate(size_t idx) const override
{
EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
return exec_place::device(dstream_.dev_id).activate();
}

stream_pool& get_stream_pool(bool) const override
{
return dummy_pool;
}
void deactivate(size_t idx, const exec_place& prev) const override
{
EXPECT(idx == 0, "Index out of bounds for cuda_stream exec_place");
exec_place::device(dstream_.dev_id).deactivate(prev);
}

::std::string to_string() const override
{
return "exec(stream id=" + ::std::to_string(dstream.id) + " dev=" + ::std::to_string(dstream.dev_id) + ")";
}
stream_pool& get_stream_pool(bool) const override
{
return dummy_pool_;
}

bool operator==(const exec_place::impl& rhs) const override
::std::string to_string() const override
{
return "exec(stream id=" + ::std::to_string(dstream_.id) + " dev=" + ::std::to_string(dstream_.dev_id) + ")";
}

int cmp(const exec_place::impl& rhs) const override
{
if (typeid(*this) != typeid(rhs))
{
if (typeid(*this) != typeid(rhs))
{
return false;
}
const auto& other = static_cast<const impl&>(rhs);
// Compare by stream handle
return dstream.stream == other.dstream.stream;
return typeid(*this).before(typeid(rhs)) ? -1 : 1;
}

size_t hash() const override
const auto& other = static_cast<const exec_place_cuda_stream_impl&>(rhs);
if (dstream_.stream < other.dstream_.stream)
{
// Hash the stream handle, not the affine data place
return ::std::hash<cudaStream_t>()(dstream.stream);
return -1;
}

bool operator<(const exec_place::impl& rhs) const override
if (other.dstream_.stream < dstream_.stream)
{
if (typeid(*this) != typeid(rhs))
{
return typeid(*this).before(typeid(rhs));
}
const auto& other = static_cast<const impl&>(rhs);
return dstream.stream < other.dstream.stream;
return 1;
}
return 0;
}

private:
decorated_stream dstream;
// We create a dummy pool of streams which only consists in a single stream in practice.
mutable stream_pool dummy_pool;
};

public:
exec_place_cuda_stream(const decorated_stream& dstream)
: exec_place(::std::make_shared<impl>(dstream))
size_t hash() const override
{
static_assert(sizeof(exec_place_cuda_stream) == sizeof(exec_place),
"exec_place_cuda_stream cannot add state; it would be sliced away.");
return ::std::hash<cudaStream_t>()(dstream_.stream);
}

private:
decorated_stream dstream_;
mutable stream_pool dummy_pool_;
};

inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)
inline exec_place exec_place::cuda_stream(cudaStream_t stream)
{
int devid = get_device_from_stream(stream);
return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));
return exec_place(
::std::make_shared<exec_place_cuda_stream_impl>(decorated_stream(stream, get_stream_id(stream), devid)));
}

inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)
inline exec_place exec_place::cuda_stream(const decorated_stream& dstream)
{
return exec_place_cuda_stream(dstream);
return exec_place(::std::make_shared<exec_place_cuda_stream_impl>(dstream));
}
} // end namespace cuda::experimental::stf
Loading
Loading