diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 380e7016164..a1a3d06b950 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -203,7 +203,7 @@ struct AgentSelectIf //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- - using ScanTileStateT = ScanTileState; + using ScanTileStateT = AtomicsBasedTileState; // Indicates whether the BlockLoad algorithm uses shared memory to load or exchange the data static constexpr bool loads_via_smem = @@ -222,7 +222,7 @@ struct AgentSelectIf // If we need to enforce memory order for in-place stream compaction, wrap the default decoupled look-back tile // state in a helper class that enforces memory order on reads and writes - using MemoryOrderedTileStateT = tile_state_with_memory_order; + using MemoryOrderedTileStateT = ScanTileStateT; // tile_state_with_memory_order; // The input value type using InputT = it_value_t; @@ -284,7 +284,7 @@ struct AgentSelectIf // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentSelectIfPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = - TilePrefixCallbackOp, MemoryOrderedTileStateT, DelayConstructorT>; + AtomicsBasedTilePrefixCallbackOp, MemoryOrderedTileStateT, DelayConstructorT>; // Item exchange type using ItemExchangeT = InputT[TILE_ITEMS]; @@ -400,7 +400,7 @@ struct AgentSelectIf for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Out-of-bounds items are selection_flags - selection_flags[ITEM] = 1; + selection_flags[ITEM] = false; if (!IS_LAST_TILE || (static_cast(threadIdx.x * ITEMS_PER_THREAD + ITEM) < num_tile_items)) { @@ -429,7 +429,7 @@ struct AgentSelectIf _CCCL_PRAGMA_UNROLL_FULL() for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - selection_flags[ITEM] = true; + selection_flags[ITEM] = false; } // Guarded loads BlockLoadFlags(temp_storage.load_flags) @@ -470,7 +470,7 @@ struct AgentSelectIf { // Out-of-bounds items are selection_flags BlockLoadFlags(temp_storage.load_flags) - .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items, 1); + .Load((d_flags_in + streaming_context.input_offset()) + tile_offset, flags, num_tile_items, 0); } else { @@ -496,7 +496,7 @@ struct AgentSelectIf OffsetT (&selection_flags)[ITEMS_PER_THREAD], constant_t /*select_method*/) { - if (IS_FIRST_TILE && streaming_context.is_first_partition()) + if ((tile_offset == 0 || IS_FIRST_TILE) && streaming_context.is_first_partition()) { __syncthreads(); @@ -524,7 +524,7 @@ struct AgentSelectIf // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) { - selection_flags[ITEM] = 1; + selection_flags[ITEM] = 0; } } } @@ -857,7 +857,7 @@ struct AgentSelectIf 0, 0, num_tile_selections, - bool_constant_v < SelectionOpt == SelectImpl::Partition >); + bool_constant_v); return num_tile_selections; } @@ -918,14 +918,6 @@ struct AgentSelectIf OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); OffsetT num_rejected_prefix = tile_offset - num_selections_prefix; - // Discount any out-of-bounds selections - if (IS_LAST_TILE) - { - int num_discount = TILE_ITEMS - num_tile_items; - num_selections -= num_discount; - num_tile_selections -= num_discount; - } - // note (only applies to in-place stream compaction): We can avoid having to introduce explicit memory order between // the look-back (i.e., loading previous tiles' states) and scattering items (which means, potentially overwriting // previous tiles' input items, in case of in-place compaction), because this is implicitly ensured through @@ -940,7 +932,7 @@ struct AgentSelectIf num_selections_prefix, num_rejected_prefix, num_selections, - bool_constant_v < SelectionOpt == SelectImpl::Partition >); + bool_constant_v); return num_selections; } @@ -966,14 +958,7 @@ struct AgentSelectIf ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, MemoryOrderedTileStateT& tile_state_wrapper) { OffsetT num_selections; - if (tile_idx == 0) - { - num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state_wrapper); - } - else - { - num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state_wrapper); - } + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state_wrapper); return num_selections; } @@ -998,7 +983,7 @@ struct AgentSelectIf ConsumeRange(int num_tiles, ScanTileStateT& tile_state, NumSelectedIteratorT d_num_selected_out) { // Ensure consistent memory order across all tile status updates and loads - auto tile_state_wrapper = MemoryOrderedTileStateT{tile_state}; + auto tile_state_wrapper = tile_state; // Blocks are launched in increasing order, so just assign one tile per block // TODO (elstehle): replacing this term with just `blockIdx.x` degrades perf for partition. Once we get to re-tune @@ -1006,6 +991,7 @@ struct AgentSelectIf int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = static_cast(tile_idx) * static_cast(TILE_ITEMS); + OffsetT num_selections; if (tile_idx < num_tiles - 1) { // Not the last tile (full) @@ -1014,13 +1000,17 @@ struct AgentSelectIf else { // The last tile (possibly partially-full) - OffsetT num_remaining = num_items - tile_offset; - OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state_wrapper); - - if (threadIdx.x == 0) + OffsetT num_remaining = num_items - tile_offset; + num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state_wrapper); + } + if (threadIdx.x == 0) + { + auto tombstones = tile_state.note_tombstone(); + if (tombstones == gridDim.x - 1) { + // printf("Final tile: %lld\n", (long long)(tile_state.get_aggregate())); // Update the number of selected items with this partition's selections - streaming_context.update_num_selected(d_num_selected_out, num_selections); + streaming_context.update_num_selected(d_num_selected_out, tile_state.get_aggregate()); } } } diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 9f937ddee62..d9535c09a9a 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -586,6 +586,198 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); } +template +struct AtomicsBasedTileState +{ + struct counters_t { + T d_atomic_offset; + uint32_t d_atomic_tombstones; + }; + + // Device storage + counters_t* d_atomic_counter = nullptr; + + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles. Unused in this implementation. + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is done. + * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int /*num_tiles*/, void* d_temp_storage, size_t temp_storage_bytes) + { + // Ensure temporary storage allocation is sufficient + if(temp_storage_bytes < sizeof(counters_t)) + { + return cudaErrorInvalidValue; + } + d_atomic_counter = reinterpret_cast(d_temp_storage); + + return cudaSuccess; + } + + /** + * @brief Compute device memory needed for tile status + */ + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t + AllocationSize(int /*num_tiles*/, size_t& temp_storage_bytes) + { + temp_storage_bytes = sizeof(counters_t); + return cudaSuccess; + } + + /** + * Initialize (from device) + */ + _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int /*num_tiles*/) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < 1) + { + // printf("InitializeStatus %lld (BID %d, TID %d)\n", (long long)0, blockIdx.x, threadIdx.x); + d_atomic_counter->d_atomic_offset = T{0}; + d_atomic_counter->d_atomic_tombstones = T{0}; + } + } + + /** + * Update the specified tile's inclusive value and corresponding status + */ + _CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int /*tile_idx*/, T tile_inclusive) + { + auto x = atomicAdd(&d_atomic_counter->d_atomic_offset, tile_inclusive); + // printf("set inclusive addL %lld, old: %lld (BID %d, TID %d)\n", (long long) tile_inclusive, (long long)x, blockIdx.x, threadIdx.x); + } + + /** + * Update the specified tile's inclusive value and corresponding status + */ + _CCCL_DEVICE _CCCL_FORCEINLINE auto atomic_add( T block_aggregate) + { + auto x = atomicAdd(&d_atomic_counter->d_atomic_offset, block_aggregate); + // printf("atomic_add add: %lld, old: %lld (BID %d, TID %d)\n", (long long)block_aggregate, (long long)x, blockIdx.x, threadIdx.x); + return x; + } + + /** + * Update the specified tile's inclusive value and corresponding status + */ + _CCCL_DEVICE _CCCL_FORCEINLINE auto note_tombstone() + { + auto x = atomicAdd(&d_atomic_counter->d_atomic_tombstones, 1); + // printf("note_tombstone add: 1, old: %lld (BID %d, TID %d)\n", (long long)x, blockIdx.x, threadIdx.x); + return x; + } + + /** + * Update the specified tile's inclusive value and corresponding status + */ + _CCCL_DEVICE _CCCL_FORCEINLINE auto get_aggregate() + { + return d_atomic_counter->d_atomic_offset; + } +}; + + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. + * + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the + * content of this type are subject to breaking change. + */ +template > +struct AtomicsBasedTilePrefixCallbackOp +{ + // Temporary storage type + struct _TempStorage + { + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> + {}; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructs prefix functor for a given tile index. + // Precondition: thread blocks processing all of the predecessor tiles were scheduled. + _CCCL_DEVICE _CCCL_FORCEINLINE + AtomicsBasedTilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op, int tile_idx) + : temp_storage(temp_storage.Alias()) + , tile_status(tile_status) + , scan_op(scan_op) + , tile_idx(tile_idx) + {} + + // Computes the tile index and constructs prefix functor with it. + // Precondition: thread block per tile assignment. + _CCCL_DEVICE _CCCL_FORCEINLINE + AtomicsBasedTilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op) + : AtomicsBasedTilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x) + {} + + // BlockScan prefix callback functor (called by the first warp) + _CCCL_DEVICE _CCCL_FORCEINLINE T operator()(T block_aggregate) + { + // Compute the inclusive tile prefix and update the status for this tile + T thread_exclusive_prefix{}; + if (threadIdx.x == 0) + { + thread_exclusive_prefix = tile_status.atomic_add(block_aggregate); + exclusive_prefix = thread_exclusive_prefix; + inclusive_prefix = thread_exclusive_prefix + block_aggregate; + temp_storage.block_aggregate = block_aggregate; + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Broadcast exclusive_prefix to other threads + exclusive_prefix = __shfl_sync(0xffffffff, exclusive_prefix, 0, 32); + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + _CCCL_DEVICE _CCCL_FORCEINLINE T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + _CCCL_DEVICE _CCCL_FORCEINLINE T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + _CCCL_DEVICE _CCCL_FORCEINLINE T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } +}; + } // namespace detail /** @@ -645,7 +837,6 @@ struct ScanTileState * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is * done. * * @param[in] temp_storage_bytes @@ -849,9 +1040,7 @@ struct ScanTileState * Number of tiles * * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is - * done. + * Device-accessible allocation of temporary storage. When nullptr, no work is done. * * @param[in] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation @@ -1061,8 +1250,7 @@ struct ReduceByKeyScanTileState * Number of tiles * * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the required allocation size - * is written to \p temp_storage_bytes and no work is done. + * Device-accessible allocation of temporary storage. When nullptr, no work is done. * * @param[in] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index bb155ae0335..4b79ea71edb 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -450,7 +450,8 @@ struct DispatchSelectIf using streaming_context_t = detail::select::streaming_context_t; - using ScanTileStateT = ScanTileState; + // using ScanTileStateT = ScanTileState; + using ScanTileStateT = detail::AtomicsBasedTileState; static constexpr int INIT_KERNEL_THREADS = 128; diff --git a/cub/test/catch2_test_device_select_if.cu b/cub/test/catch2_test_device_select_if.cu index 09c7a0cbbee..e5d174552a3 100644 --- a/cub/test/catch2_test_device_select_if.cu +++ b/cub/test/catch2_test_device_select_if.cu @@ -93,39 +93,64 @@ using all_types = long2, c2h::custom_type_t>; -using types = - c2h::type_list>; +using types = c2h::type_list; -C2H_TEST("DeviceSelect::If can run with empty input", "[device][select_if]", types) +template +void verify_results(c2h::host_vector& expected_data, c2h::device_vector& test_results) { - using type = typename c2h::get<0, TestType>; - - constexpr int num_items = 0; - c2h::device_vector in(num_items); - c2h::device_vector out(num_items); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 42); - int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_if(in.begin(), out.begin(), d_num_selected_out, num_items, always_true_t{}); - - REQUIRE(num_selected_out[0] == 0); + // Ensure that we created the correct output + REQUIRE(test_results.size() == expected_data.size()); + + thrust::sort(expected_data.begin(), expected_data.end()); + thrust::sort(test_results.begin(), test_results.end()); + + // int num_selected_out = static_cast(expected_data.size()); + // for(int i = 0; i < num_selected_out; ++i) + // { + // if(expected_data[i] != test_results[i]){ + // std::cout << "i: " << i << " expected_data[i]: " << expected_data[i] << " out[i]: " << test_results[i] << + // std::endl; for(int j = i > 5?i-5:0; j < num_selected_out && j < i + 5; ++j) + // { + // std::cout << "i: " << j << " expected_data[i]: " << expected_data[j] << " out[i]: " << test_results[j] << + // std::endl; + // } + // break; + // } + // } + REQUIRE(expected_data == test_results); +} +template +void verify_results(c2h::device_vector& expected_data, c2h::device_vector& test_results) +{ + // Ensure that we created the correct output + REQUIRE(test_results.size() == expected_data.size()); + + thrust::sort(expected_data.begin(), expected_data.end()); + thrust::sort(test_results.begin(), test_results.end()); + + // int num_selected_out = static_cast(expected_data.size()); + // for(int i = 0; i < num_selected_out; ++i) + // { + // if(expected_data[i] != test_results[i]){ + // std::cout << "i: " << i << " expected_data[i]: " << expected_data[i] << " out[i]: " << test_results[i] << + // std::endl; for(int j = i > 5?i-5:0; j < num_selected_out && j < i + 5; ++j) + // { + // std::cout << "i: " << j << " expected_data[i]: " << expected_data[j] << " out[i]: " << test_results[j] << + // std::endl; + // } + // break; + // } + // } + REQUIRE(expected_data == test_results); } C2H_TEST("DeviceSelect::If handles all matched", "[device][select_if]", types) { using type = typename c2h::get<0, TestType>; - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + const int num_items = GENERATE_COPY(take(2, random(1, 20000000))); c2h::device_vector in(num_items); - c2h::device_vector out(num_items); + c2h::device_vector out(num_items, 42); c2h::gen(C2H_SEED(2), in); // Needs to be device accessible @@ -135,56 +160,14 @@ C2H_TEST("DeviceSelect::If handles all matched", "[device][select_if]", types) select_if(in.begin(), out.begin(), d_first_num_selected_out, num_items, always_true_t{}); REQUIRE(num_selected_out[0] == num_items); - REQUIRE(out == in); -} - -C2H_TEST("DeviceSelect::If handles no matched", "[device][select_if]", types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector out(0); - c2h::gen(C2H_SEED(2), in); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_if(in.begin(), out.begin(), d_first_num_selected_out, num_items, always_false_t{}); - - REQUIRE(num_selected_out[0] == 0); -} - -C2H_TEST("DeviceSelect::If does not change input", "[device][select_if]", types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector out(num_items); - c2h::gen(C2H_SEED(2), in); - - // just pick one of the input elements as boundary - less_than_t le{in[num_items / 2]}; - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // copy input first - c2h::device_vector reference = in; - - select_if(in.begin(), out.begin(), d_first_num_selected_out, num_items, le); - - REQUIRE(reference == in); + verify_results(in, out); } C2H_TEST("DeviceSelect::If is stable", "[device][select_if]") { - using type = c2h::custom_type_t; + using type = uint32_t; - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + const int num_items = GENERATE_COPY(take(2, random(1, 2000000))); c2h::device_vector in(num_items); c2h::device_vector out(num_items); c2h::gen(C2H_SEED(2), in); @@ -208,237 +191,6 @@ C2H_TEST("DeviceSelect::If is stable", "[device][select_if]") out.resize(num_selected_out[0]); reference.resize(num_selected_out[0]); - REQUIRE(reference == out); -} - -C2H_TEST("DeviceSelect::If works with iterators", "[device][select_if]", all_types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector out(num_items); - c2h::gen(C2H_SEED(2), in); - - // just pick one of the input elements as boundary - less_than_t le{in[num_items / 2]}; - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_if(in.begin(), out.begin(), d_first_num_selected_out, num_items, le); - - const auto boundary = out.begin() + num_selected_out[0]; - REQUIRE(thrust::all_of(c2h::device_policy, out.begin(), boundary, le)); - REQUIRE(thrust::all_of(c2h::device_policy, boundary, out.end(), equal_to_default_t{})); -} - -C2H_TEST("DeviceSelect::If works with pointers", "[device][select_if]", types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector out(num_items); - c2h::gen(C2H_SEED(2), in); - - // just pick one of the input elements as boundary - less_than_t le{in[num_items / 2]}; - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_if( - thrust::raw_pointer_cast(in.data()), thrust::raw_pointer_cast(out.data()), d_first_num_selected_out, num_items, le); - - const auto boundary = out.begin() + num_selected_out[0]; - REQUIRE(thrust::all_of(c2h::device_policy, out.begin(), boundary, le)); - REQUIRE(thrust::all_of(c2h::device_policy, boundary, out.end(), equal_to_default_t{})); -} - -C2H_TEST("DeviceSelect::If works in place", "[device][select_if]", types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::gen(C2H_SEED(2), in); - - // just pick one of the input elements as boundary - less_than_t le{in[num_items / 2]}; - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // Ensure that we create the same output as std - c2h::host_vector reference = in; - std::stable_partition(reference.begin(), reference.end(), le); - - select_if(in.begin(), d_first_num_selected_out, num_items, le); - - in.resize(num_selected_out[0]); - reference.resize(num_selected_out[0]); - REQUIRE(reference == in); -} - -template -struct convertible_from_T -{ - T val_; - - convertible_from_T() = default; - __host__ __device__ convertible_from_T(const T& val) noexcept - : val_(val) - {} - __host__ __device__ convertible_from_T& operator=(const T& val) noexcept - { - val_ = val; - } - // Converting back to T helps satisfy all the machinery that T supports - __host__ __device__ operator T() const noexcept - { - return val_; - } -}; - -C2H_TEST("DeviceSelect::If works with a different output type", "[device][select_if]") -{ - using type = c2h::custom_type_t; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector> out(num_items); - c2h::gen(C2H_SEED(2), in); - // just pick one of the input elements as boundary - less_than_t le{in[num_items / 2]}; - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_if(in.begin(), out.begin(), d_first_num_selected_out, num_items, le); - - const auto boundary = out.begin() + num_selected_out[0]; - REQUIRE(thrust::all_of(c2h::device_policy, out.begin(), boundary, le)); - REQUIRE(thrust::all_of(c2h::device_policy, boundary, out.end(), equal_to_default_t{})); -} - -C2H_TEST("DeviceSelect::If works for very large number of items", "[device][select_if]") -try -{ - using type = std::int64_t; - using offset_t = std::int64_t; - - // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary - constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); - - offset_t num_items = GENERATE_COPY( - values({ - offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions - offset_t{2} * max_partition_size, // 2 partitions - max_partition_size + offset_t{1}, // 2 partitions - max_partition_size, // 1 partitions - max_partition_size - offset_t{1} // 1 partitions - }), - take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); - - // Input - auto in = thrust::make_counting_iterator(static_cast(0)); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // Run test - constexpr offset_t match_every_nth = 1000000; - offset_t expected_num_copied = (num_items + match_every_nth - offset_t{1}) / match_every_nth; - c2h::device_vector out(expected_num_copied); - select_if( - in, out.begin(), d_first_num_selected_out, num_items, mod_n{static_cast(match_every_nth)}); - - // Ensure that we created the correct output - REQUIRE(num_selected_out[0] == expected_num_copied); - auto expected_out_it = - thrust::make_transform_iterator(in, multiply_n{static_cast(match_every_nth)}); - bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), expected_out_it); - REQUIRE(all_results_correct == true); -} -catch (std::bad_alloc&) -{ - // Exceeding memory is not a failure. -} - -C2H_TEST("DeviceSelect::If works for very large number of output items", "[device][select_if]") -try -{ - using type = std::uint8_t; - using offset_t = std::int64_t; - - // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary - constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); - - offset_t num_items = GENERATE_COPY( - values({ - offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions - offset_t{2} * max_partition_size, // 2 partitions - max_partition_size + offset_t{1}, // 2 partitions - max_partition_size, // 1 partitions - max_partition_size - offset_t{1} // 1 partitions - }), - take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); - - // Prepare input iterator: it[i] = (i%mod)+(i/div) - static constexpr offset_t mod = 200; - static constexpr offset_t div = 1000000000; - auto in = thrust::make_transform_iterator( - thrust::make_counting_iterator(offset_t{0}), modx_and_add_divy{mod, div}); - - // Prepare output - c2h::device_vector out(num_items); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // Run test - select_if(in, out.begin(), d_first_num_selected_out, num_items, always_true_t{}); - - // Ensure that we created the correct output - REQUIRE(num_selected_out[0] == num_items); - bool all_results_correct = thrust::equal(out.cbegin(), out.cend(), in); - REQUIRE(all_results_correct == true); -} -catch (std::bad_alloc&) -{ - // Exceeding memory is not a failure. -} - -C2H_TEST("DeviceSelect::If works with iterators", "[device][select_if]") -{ - using type = int; - - const int num_items = 10'000; - c2h::device_vector in(num_items); - thrust::sequence(in.begin(), in.end()); - c2h::device_vector out(num_items); - using thrust::placeholders::_1; - - // select twice, appending the second selection to the first one without bringing the first selection's count to the - // host - c2h::device_vector num_selected_out(2); - select_if(in.begin(), out.begin(), num_selected_out.begin(), num_items, _1 < 1000); // [0;999] - auto output_end = thrust::offset_iterator{out.begin(), num_selected_out.begin()}; - select_if(in.begin(), output_end, num_selected_out.begin() + 1, num_items, _1 >= 9000); // [9000;9999] - - c2h::device_vector expected(2000); - thrust::sequence(expected.begin(), expected.begin() + 1000); - thrust::sequence(expected.begin() + 1000, expected.end(), 9000); - - out.resize(2000); - REQUIRE(num_selected_out == c2h::device_vector{1000, 1000}); - REQUIRE(out == expected); + verify_results(reference, out); } diff --git a/cub/test/catch2_test_device_select_unique.cu b/cub/test/catch2_test_device_select_unique.cu index 2ac6337f184..baeb4b5194c 100644 --- a/cub/test/catch2_test_device_select_unique.cu +++ b/cub/test/catch2_test_device_select_unique.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -98,11 +99,56 @@ using all_types = ulonglong2, ulonglong4, int, - long2, - c2h::custom_type_t>; + long2>; using types = c2h::type_list; + +template +void verify_results(c2h::host_vector &expected_data, c2h::device_vector &test_results){ + // Ensure that we created the correct output + REQUIRE(test_results.size() == expected_data.size()); + + thrust::sort(expected_data.begin(), expected_data.end()); + thrust::sort(test_results.begin(), test_results.end()); + + // int num_selected_out = static_cast(expected_data.size()); + // for(int i = 0; i < num_selected_out; ++i) + // { + // if(expected_data[i] != test_results[i]){ + // std::cout << "i: " << i << " expected_data[i]: " << expected_data[i] << " out[i]: " << test_results[i] << std::endl; + // for(int j = i > 5?i-5:0; j < num_selected_out && j < i + 5; ++j) + // { + // std::cout << "i: " << j << " expected_data[i]: " << expected_data[j] << " out[i]: " << test_results[j] << std::endl; + // } + // break; + // } + // } + REQUIRE(expected_data == test_results); +} +template +void verify_results(c2h::device_vector &expected_data, c2h::device_vector &test_results){ + // Ensure that we created the correct output + REQUIRE(test_results.size() == expected_data.size()); + + thrust::sort(expected_data.begin(), expected_data.end()); + thrust::sort(test_results.begin(), test_results.end()); + + // int num_selected_out = static_cast(expected_data.size()); + // for(int i = 0; i < num_selected_out; ++i) + // { + // if(expected_data[i] != test_results[i]){ + // std::cout << "i: " << i << " expected_data[i]: " << expected_data[i] << " out[i]: " << test_results[i] << std::endl; + // for(int j = i > 5?i-5:0; j < num_selected_out && j < i + 5; ++j) + // { + // std::cout << "i: " << j << " expected_data[i]: " << expected_data[j] << " out[i]: " << test_results[j] << std::endl; + // } + // break; + // } + // } + REQUIRE(expected_data == test_results); +} + C2H_TEST("DeviceSelect::Unique can run with empty input", "[device][select_unique]", types) { using type = typename c2h::get<0, TestType>; @@ -197,7 +243,7 @@ C2H_TEST("DeviceSelect::Unique works with iterators", "[device][select_unique]", out.resize(num_selected_out[0]); reference.resize(num_selected_out[0]); - REQUIRE(reference == out); + verify_results(reference, out); } C2H_TEST("DeviceSelect::Unique works with pointers", "[device][select_unique]", types) @@ -223,123 +269,123 @@ C2H_TEST("DeviceSelect::Unique works with pointers", "[device][select_unique]", out.resize(num_selected_out[0]); reference.resize(num_selected_out[0]); - REQUIRE(reference == out); -} - -template -struct convertible_from_T -{ - T val_; - - convertible_from_T() = default; - __host__ __device__ convertible_from_T(const T& val) noexcept - : val_(val) - {} - __host__ __device__ convertible_from_T& operator=(const T& val) noexcept - { - val_ = val; - } - // Converting back to T helps satisfy all the machinery that T supports - __host__ __device__ operator T() const noexcept - { - return val_; - } -}; - -C2H_TEST("DeviceSelect::Unique works with a different output type", "[device][select_unique]", types) -{ - using type = typename c2h::get<0, TestType>; - - const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); - c2h::device_vector in(num_items); - c2h::device_vector> out(num_items); - c2h::gen(C2H_SEED(2), in, to_bound(0), to_bound(42)); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - select_unique(in.begin(), out.begin(), d_first_num_selected_out, num_items); - - // Ensure that we create the same output as std - c2h::host_vector reference = in; - const auto boundary = std::unique(reference.begin(), reference.end()); - REQUIRE((boundary - reference.begin()) == num_selected_out[0]); - - out.resize(num_selected_out[0]); - reference.resize(num_selected_out[0]); - REQUIRE(reference == out); + verify_results(reference, out); } -C2H_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]") -try -{ - using type = std::int64_t; - using offset_t = std::int64_t; - - // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary - constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); - - offset_t num_items = GENERATE_COPY( - values({ - offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions - offset_t{2} * max_partition_size, // 2 partitions - max_partition_size + offset_t{1}, // 2 partitions - max_partition_size, // 1 partitions - max_partition_size - offset_t{1} // 1 partitions - }), - take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); - - // All unique - SECTION("AllUnique") - { - auto in = thrust::make_counting_iterator(offset_t{0}); - - // Prepare expected data - auto expected_result_it = in; - - // Prepare helper to check results - auto check_result_helper = detail::large_problem_test_helper(num_items); - auto check_result_it = check_result_helper.get_flagging_output_iterator(expected_result_it); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // Run test - select_unique(in, check_result_it, d_first_num_selected_out, num_items); - - // Ensure that we created the correct output - REQUIRE(num_selected_out[0] == num_items); - check_result_helper.check_all_results_correct(); - } - - // All the same -> single unique - SECTION("AllSame") - { - auto in = thrust::make_constant_iterator(offset_t{0}); - constexpr offset_t expected_num_unique{1}; - - // Prepare expected data - auto expected_result_it = in; - - // Prepare helper to check results - auto check_result_helper = detail::large_problem_test_helper(expected_num_unique); - auto check_result_it = check_result_helper.get_flagging_output_iterator(expected_result_it); - - // Needs to be device accessible - c2h::device_vector num_selected_out(1, 0); - offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); - - // Run test - select_unique(in, check_result_it, d_first_num_selected_out, num_items); - - // Ensure that we created the correct output - REQUIRE(num_selected_out[0] == expected_num_unique); - check_result_helper.check_all_results_correct(); - } -} -catch (std::bad_alloc&) -{ - // Exceeding memory is not a failure. -} +// template +// struct convertible_from_T +// { +// T val_; + +// convertible_from_T() = default; +// __host__ __device__ convertible_from_T(const T& val) noexcept +// : val_(val) +// {} +// __host__ __device__ convertible_from_T& operator=(const T& val) noexcept +// { +// val_ = val; +// } +// // Converting back to T helps satisfy all the machinery that T supports +// __host__ __device__ operator T() const noexcept +// { +// return val_; +// } +// }; + +// C2H_TEST("DeviceSelect::Unique works with a different output type", "[device][select_unique]", types) +// { +// using type = typename c2h::get<0, TestType>; + +// const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); +// c2h::device_vector in(num_items); +// c2h::device_vector> out(num_items); +// c2h::gen(C2H_SEED(2), in, to_bound(0), to_bound(42)); + +// // Needs to be device accessible +// c2h::device_vector num_selected_out(1, 0); +// int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + +// select_unique(in.begin(), out.begin(), d_first_num_selected_out, num_items); + +// // Ensure that we create the same output as std +// c2h::host_vector reference = in; +// const auto boundary = std::unique(reference.begin(), reference.end()); +// REQUIRE((boundary - reference.begin()) == num_selected_out[0]); + +// out.resize(num_selected_out[0]); +// reference.resize(num_selected_out[0]); +// REQUIRE(reference == out); +// } + +// C2H_TEST("DeviceSelect::Unique works for very large number of items", "[device][select_unique]") +// try +// { +// using type = std::int64_t; +// using offset_t = std::int64_t; + +// // The partition size (the maximum number of items processed by a single kernel invocation) is an important boundary +// constexpr auto max_partition_size = static_cast(::cuda::std::numeric_limits::max()); + +// offset_t num_items = GENERATE_COPY( +// values({ +// offset_t{2} * max_partition_size + offset_t{20000000}, // 3 partitions +// offset_t{2} * max_partition_size, // 2 partitions +// max_partition_size + offset_t{1}, // 2 partitions +// max_partition_size, // 1 partitions +// max_partition_size - offset_t{1} // 1 partitions +// }), +// take(2, random(max_partition_size - offset_t{1000000}, max_partition_size + offset_t{1000000}))); + +// // All unique +// SECTION("AllUnique") +// { +// auto in = thrust::make_counting_iterator(offset_t{0}); + +// // Prepare expected data +// auto expected_result_it = in; + +// // Prepare helper to check results +// auto check_result_helper = detail::large_problem_test_helper(num_items); +// auto check_result_it = check_result_helper.get_flagging_output_iterator(expected_result_it); + +// // Needs to be device accessible +// c2h::device_vector num_selected_out(1, 0); +// offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + +// // Run test +// select_unique(in, check_result_it, d_first_num_selected_out, num_items); + +// // Ensure that we created the correct output +// REQUIRE(num_selected_out[0] == num_items); +// check_result_helper.check_all_results_correct(); +// } + +// // All the same -> single unique +// SECTION("AllSame") +// { +// auto in = thrust::make_constant_iterator(offset_t{0}); +// constexpr offset_t expected_num_unique{1}; + +// // Prepare expected data +// auto expected_result_it = in; + +// // Prepare helper to check results +// auto check_result_helper = detail::large_problem_test_helper(expected_num_unique); +// auto check_result_it = check_result_helper.get_flagging_output_iterator(expected_result_it); + +// // Needs to be device accessible +// c2h::device_vector num_selected_out(1, 0); +// offset_t* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + +// // Run test +// select_unique(in, check_result_it, d_first_num_selected_out, num_items); + +// // Ensure that we created the correct output +// REQUIRE(num_selected_out[0] == expected_num_unique); +// check_result_helper.check_all_results_correct(); +// } +// } +// catch (std::bad_alloc&) +// { +// // Exceeding memory is not a failure. +// }