From fc5c078b87d5242d21cdbb680c9a84db66d04260 Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Mon, 2 Mar 2020 19:20:55 +0100 Subject: [PATCH 1/6] Add new parameters to the FFT's constructor to be able to reuse internal buffers and avoid unnecessary GPU memory usage. --- glfft.cpp | 26 +++++++++++++++++--------- glfft.hpp | 11 ++++++++++- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/glfft.cpp b/glfft.cpp index bf55422..fae49c0 100644 --- a/glfft.cpp +++ b/glfft.cpp @@ -522,7 +522,8 @@ static inline unsigned type_to_input_components(Type type) FFT::FFT(Context *context, unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, - std::shared_ptr program_cache, const FFTOptions &options, const FFTWisdom &wisdom) + std::shared_ptr program_cache, const FFTOptions &options, const FFTWisdom &wisdom, + std::unique_ptr reuse_preallocated_temporary_buffer0, std::unique_ptr reuse_preallocated_temporary_buffer1) : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny) { set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny); @@ -530,10 +531,14 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2); temp_buffer_size >>= options.type.output_fp16; - temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); + temp_buffer = reuse_preallocated_temporary_buffer0 ? + std::move(reuse_preallocated_temporary_buffer0) : + context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); if (output_target != SSBO) { - temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); + temp_buffer_image = reuse_preallocated_temporary_buffer1 ? + std::move(reuse_preallocated_temporary_buffer1) : + context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); } bool expand = false; @@ -1001,9 +1006,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc Resource *buffers[2] = { input, - passes.size() & 1 ? - (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) : - temp_buffer.get(), + (passes.back().parameters.output_target == SSBO && passes.size() & 1) ? output : temp_buffer.get() }; if (input_aux != 0) @@ -1144,9 +1147,14 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc if (pass_index == 0) { - buffers[0] = passes.size() & 1 ? - temp_buffer.get() : - (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output); + if (passes.back().parameters.output_target == SSBO) + { + buffers[0] = passes.size() & 1 ? temp_buffer.get() : output; + } + else + { + buffers[0] = temp_buffer_image.get(); + } } swap(buffers[0], buffers[1]); diff --git a/glfft.hpp b/glfft.hpp index c3b1a69..cd0ff4b 100644 --- a/glfft.hpp +++ b/glfft.hpp @@ -55,10 +55,19 @@ class FFT /// @param options FFT options such as performance related parameters and types. /// @param wisdom GLFFT wisdom which can override performance related options /// (options.performance is used as a fallback). + /// @param reuse_preallocated_temporary_buffer0 + /// For large FFTs also a large internal temporary buffer is required. To reduce memory consumption + /// you can provide a preallocated buffer here that can be shared with other parts of the program. + /// The buffer must have size at least Nx * Ny * (type == ComplexToComplexDual ? 4 : 2) * (options.type.fp16 ? 2 : 4). + /// The provided buffer must not be used while the FFT is in progress and will contain unpredictable garbage data afterwards. + /// @param reuse_preallocated_temporary_buffer1 + /// Same as reuse_preallocated_temporary_buffer0 and used only if the output is a texture. + /// May be aliased with the input if the input if the input is not needed again after processing. FFT(Context *context, unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, std::shared_ptr cache, const FFTOptions &options, - const FFTWisdom &wisdom = FFTWisdom()); + const FFTWisdom &wisdom = FFTWisdom(), std::unique_ptr reuse_preallocated_temporary_buffer0 = nullptr, + std::unique_ptr reuse_preallocated_temporary_buffer1 = nullptr); /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs. /// From 8af8eabedc6995bb98e53d168356572c37b84e57 Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Mon, 2 Mar 2020 22:19:41 +0100 Subject: [PATCH 2/6] Allow specifying custom sampling code in the shader. --- glfft.cpp | 9 +++++++-- glfft.hpp | 8 +++++++- glfft_common.hpp | 18 ++++++++++++------ glsl/fft_common.comp | 5 ++++- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/glfft.cpp b/glfft.cpp index fae49c0..d1bdefe 100644 --- a/glfft.cpp +++ b/glfft.cpp @@ -523,7 +523,7 @@ static inline unsigned type_to_input_components(Type type) FFT::FFT(Context *context, unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, std::shared_ptr program_cache, const FFTOptions &options, const FFTWisdom &wisdom, - std::unique_ptr reuse_preallocated_temporary_buffer0, std::unique_ptr reuse_preallocated_temporary_buffer1) + std::string input_load_texture_code, std::unique_ptr reuse_preallocated_temporary_buffer0, std::unique_ptr reuse_preallocated_temporary_buffer1) : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny) { set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny); @@ -649,6 +649,7 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, radix.shared_banked, options.type.fp16, input_fp16, options.type.output_fp16, options.type.normalize, + input_load_texture_code }; const Pass pass = { @@ -736,7 +737,7 @@ void FFT::store_shader_string(const char *path, const string &source) unique_ptr FFT::build_program(const Parameters ¶ms) { string str; - str.reserve(16 * 1024); + str.reserve(64 * 1024); #if 0 context->log("Building program:\n"); @@ -799,6 +800,10 @@ unique_ptr FFT::build_program(const Parameters ¶ms) str += "#define FFT_CONVOLVE\n"; } + str += "#define FFT_LOAD_TEXTURE_CODE "; + str += params.input_load_texture_code.empty() ? input_load_texture_code_default : params.input_load_texture_code; + str += "\n"; + str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n"; str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n"; diff --git a/glfft.hpp b/glfft.hpp index cd0ff4b..a202098 100644 --- a/glfft.hpp +++ b/glfft.hpp @@ -55,6 +55,10 @@ class FFT /// @param options FFT options such as performance related parameters and types. /// @param wisdom GLFFT wisdom which can override performance related options /// (options.performance is used as a fallback). + /// @param input_load_texture_code + /// Custom code for sampling the input texture can be inserted here. + /// This must only use a single line and must define a function with signature + /// "cfloat load_texture(uvec2 coord)" and can call "cfloat load_texture_inner(uvec2 coord)". /// @param reuse_preallocated_temporary_buffer0 /// For large FFTs also a large internal temporary buffer is required. To reduce memory consumption /// you can provide a preallocated buffer here that can be shared with other parts of the program. @@ -66,7 +70,9 @@ class FFT FFT(Context *context, unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, std::shared_ptr cache, const FFTOptions &options, - const FFTWisdom &wisdom = FFTWisdom(), std::unique_ptr reuse_preallocated_temporary_buffer0 = nullptr, + const FFTWisdom &wisdom = FFTWisdom(), + std::string input_load_texture_code = input_load_texture_code_default, + std::unique_ptr reuse_preallocated_temporary_buffer0 = nullptr, std::unique_ptr reuse_preallocated_temporary_buffer1 = nullptr); /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs. diff --git a/glfft_common.hpp b/glfft_common.hpp index 4065f68..b2b6a1a 100644 --- a/glfft_common.hpp +++ b/glfft_common.hpp @@ -33,7 +33,7 @@ namespace GLFFT { -enum Direction +enum Direction : char { /// Forward FFT transform. Forward = -1, @@ -44,7 +44,7 @@ enum Direction Inverse = 1 }; -enum Mode +enum Mode : char { Horizontal, HorizontalDual, @@ -55,7 +55,7 @@ enum Mode ResolveComplexToReal, }; -enum Type +enum Type : char { /// Regular complex-to-complex transform. ComplexToComplex, @@ -68,7 +68,7 @@ enum Type RealToComplex }; -enum Target +enum Target : char { /// GL_SHADER_STORAGE_BUFFER SSBO, @@ -81,6 +81,11 @@ enum Target ImageReal }; +static constexpr char const input_load_texture_code_default[] = + "cfloat load_texture(uvec2 coord) {" + " return load_texture_inner(coord);" + "}"; + struct Parameters { unsigned workgroup_size_x; @@ -96,10 +101,11 @@ struct Parameters bool shared_banked; bool fft_fp16, input_fp16, output_fp16; bool fft_normalize; - + std::string input_load_texture_code; // If empty defaults to input_load_texture_code_default. Unfortunately we can't put it here because that breaks the initializer lists in C++11. bool operator==(const Parameters &other) const { - return std::memcmp(this, &other, sizeof(Parameters)) == 0; + return std::memcmp(this, &other, offsetof(Parameters, input_load_texture_code)) == 0 + && input_load_texture_code == other.input_load_texture_code; } }; diff --git a/glsl/fft_common.comp b/glsl/fft_common.comp index c8f0e3c..df95624 100644 --- a/glsl/fft_common.comp +++ b/glsl/fft_common.comp @@ -314,7 +314,7 @@ cfloat load_texture(sampler2D sampler, uvec2 coord) #endif } -cfloat load_texture(uvec2 coord) +cfloat load_texture_inner(uvec2 coord) { #ifdef FFT_CONVOLVE // Convolution in frequency domain is multiplication. @@ -326,6 +326,9 @@ cfloat load_texture(uvec2 coord) #endif } +// This must define a function with signature cfloat load_texture(uvec2 coord) +FFT_LOAD_TEXTURE_CODE + // Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere. #ifdef FFT_VEC8 cfloat load_global(uint offset) From 9502db7eca4e88a869882b048b325bf6ba03a7f2 Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Tue, 3 Mar 2020 19:28:07 +0100 Subject: [PATCH 3/6] Fix a couple warnings by using the correct data type. --- glfft.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/glfft.hpp b/glfft.hpp index a202098..e1c6252 100644 --- a/glfft.hpp +++ b/glfft.hpp @@ -130,12 +130,12 @@ class FFT double get_cost() const { return cost; } /// @brief Returns number of passes (glDispatchCompute) in a process() call. - unsigned get_num_passes() const { return passes.size(); } + size_t get_num_passes() const { return passes.size(); } /// @brief Returns Nx. - unsigned get_dimension_x() const { return size_x; } + size_t get_dimension_x() const { return size_x; } /// @brief Returns Ny. - unsigned get_dimension_y() const { return size_y; } + size_t get_dimension_y() const { return size_y; } /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures. /// From 70d927c7658282a1652cb4e0b00e51a0a1f1cb6d Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Sat, 7 Mar 2020 00:28:22 +0100 Subject: [PATCH 4/6] Fix glfft intermediate data formats. Sometimes it is internally using half precision for intermediate results nobody asked for. This introduces rounding errors. --- glfft.cpp | 24 +++++++++++++++--------- glfft_common.hpp | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/glfft.cpp b/glfft.cpp index d1bdefe..2c81012 100644 --- a/glfft.cpp +++ b/glfft.cpp @@ -529,12 +529,12 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny); size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2); - temp_buffer_size >>= options.type.output_fp16; + temp_buffer_size >>= options.type.fp16; temp_buffer = reuse_preallocated_temporary_buffer0 ? std::move(reuse_preallocated_temporary_buffer0) : context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy); - if (output_target != SSBO) + if (output_target != SSBO || (options.type.output_fp16 && !options.type.fp16)) // @HigherIntermediatePrecision We may need higher intermediate precision. { temp_buffer_image = reuse_preallocated_temporary_buffer1 ? std::move(reuse_preallocated_temporary_buffer1) : @@ -629,7 +629,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, // If this is the last pass and we're writing to an image, use a special shader variant. bool last_pass = index == last_index && i == radix_direction.size() - 1; - bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16; + bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16; + bool output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16; Target out_target = last_pass ? output_target : SSBO; Target in_target = passes.empty() ? input_target : SSBO; Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction; @@ -647,7 +648,7 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, out_target, p == 1, radix.shared_banked, - options.type.fp16, input_fp16, options.type.output_fp16, + options.type.fp16, input_fp16, output_fp16, options.type.normalize, input_load_texture_code }; @@ -670,7 +671,6 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, // This way, we avoid having special purpose transforms for all FFT variants. if (index == 0 && (type == ComplexToReal || type == RealToComplex)) { - bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16; bool last_pass = radices[1].empty(); Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction; Target in_target = passes.empty() ? input_target : SSBO; @@ -679,7 +679,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny, unsigned uv_scale_x = 1; auto base_opts = options; - base_opts.type.input_fp16 = input_fp16; + base_opts.type.input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16; + base_opts.type.output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16; auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts); auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 }); @@ -1011,7 +1012,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc Resource *buffers[2] = { input, - (passes.back().parameters.output_target == SSBO && passes.size() & 1) ? output : temp_buffer.get() + (!temp_buffer_image && passes.size() & 1) ? output : temp_buffer.get() // If no 'temp_buffer_image' is available, we must be use the output buffer directly. }; if (input_aux != 0) @@ -1066,6 +1067,11 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc constant_data.stride = pass.stride; p *= pass.parameters.radix; + if (pass_index + 1 >= passes.size()) // In the last pass we need to inject our output buffer. + { + buffers[1] = output; + } + if (pass.parameters.input_target != SSBO) { cmd->bind_texture(BindingTexture0, static_cast(buffers[0])); @@ -1125,7 +1131,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc break; } } - cmd->bind_storage_texture(BindingImage, static_cast(output), format); + cmd->bind_storage_texture(BindingImage, static_cast(buffers[1]), format); } else { @@ -1152,7 +1158,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc if (pass_index == 0) { - if (passes.back().parameters.output_target == SSBO) + if (!temp_buffer_image) // If no 'temp_buffer_image' is available, we must be use the output buffer directly. { buffers[0] = passes.size() & 1 ? temp_buffer.get() : output; } diff --git a/glfft_common.hpp b/glfft_common.hpp index b2b6a1a..dc35cb0 100644 --- a/glfft_common.hpp +++ b/glfft_common.hpp @@ -132,7 +132,7 @@ struct FFTOptions struct Type { - /// Whether internal shader should be mediump float. + /// Whether internal shader and intermediate results should be mediump float. bool fp16 = false; /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32. bool input_fp16 = false; From 1bc68b75fd1b47d479e0b58319a958d0bd90d51a Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Sat, 7 Mar 2020 15:45:05 +0100 Subject: [PATCH 5/6] New option to only run some tests. --- test/glfft_cli.cpp | 1 + test/glfft_cli.hpp | 1 + test/glfft_test.cpp | 39 ++++++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp index 2ada152..a7c2709 100644 --- a/test/glfft_cli.cpp +++ b/test/glfft_cli.cpp @@ -335,6 +335,7 @@ static int cli_test(Context *context, int argc, char *argv[]) cbs.add("--minimum-snr-fp32", [&args](CLIParser &parser) { args.min_snr_fp32 = parser.next_double(); }); cbs.add("--epsilon-fp16", [&args](CLIParser &parser) { args.epsilon_fp16 = parser.next_double(); }); cbs.add("--epsilon-fp32", [&args](CLIParser &parser) { args.epsilon_fp32 = parser.next_double(); }); + cbs.add("--single-base-size", [&args](CLIParser &parser) { args.single_base_size = true; }); cbs.error_handler = [context]{ cli_test_help(context); }; CLIParser parser(move(cbs), argc, argv); diff --git a/test/glfft_cli.hpp b/test/glfft_cli.hpp index 3d29e19..dff67ba 100644 --- a/test/glfft_cli.hpp +++ b/test/glfft_cli.hpp @@ -40,6 +40,7 @@ namespace GLFFT unsigned test_id_min = 0; unsigned test_id_max = 0; bool exhaustive = true; + bool single_base_size = true; bool throw_on_fail = false; double min_snr_fp16 = 50.0; double min_snr_fp32 = 100.0; diff --git a/test/glfft_test.cpp b/test/glfft_test.cpp index 208b423..300bde3 100644 --- a/test/glfft_test.cpp +++ b/test/glfft_test.cpp @@ -494,11 +494,13 @@ static mufft_buffer convert_fp16_fp32(const uint32_t *input, unsigned N) static void run_test_ssbo(Context *context, const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr &cache) { - context->log("Running SSBO -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n", - Nx, Ny, direction_to_str(direction), type_to_str(type), - options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, - options.type.input_fp16 ? "yes" : "no", - options.type.output_fp16 ? "yes" : "no"); + context->log("Running SSBO -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n", + Nx, Ny, direction_to_str(direction), type_to_str(type), + options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, + options.type.input_fp16 ? "yes" : "no", + options.type.output_fp16 ? "yes" : "no", + options.type.fp16 ? "yes" : "no"); + unique_ptr test_input; unique_ptr test_output; @@ -545,11 +547,12 @@ static void run_test_ssbo(Context *context, static void run_test_texture(Context *context, const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr &cache) { - context->log("Running Texture -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n", - Nx, Ny, direction_to_str(direction), type_to_str(type), - options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, - options.type.input_fp16 ? "yes" : "no", - options.type.output_fp16 ? "yes" : "no"); + context->log("Running Texture -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n", + Nx, Ny, direction_to_str(direction), type_to_str(type), + options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, + options.type.input_fp16 ? "yes" : "no", + options.type.output_fp16 ? "yes" : "no", + options.type.fp16 ? "yes" : "no"); unique_ptr test_input; unique_ptr test_output; @@ -634,11 +637,12 @@ static mufft_buffer readback_texture(Context *context, Texture *tex, unsigned co static void run_test_image(Context *context, const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr &cache) { - context->log("Running SSBO -> Image FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n", - Nx, Ny, direction_to_str(direction), type_to_str(type), - options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, - options.type.input_fp16 ? "yes" : "no", - options.type.output_fp16 ? "yes" : "no"); + context->log("Running SSBO -> Image FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n", + Nx, Ny, direction_to_str(direction), type_to_str(type), + options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y, + options.type.input_fp16 ? "yes" : "no", + options.type.output_fp16 ? "yes" : "no", + options.type.fp16 ? "yes" : "no"); unique_ptr test_input; @@ -807,6 +811,11 @@ void GLFFT::Internal::run_test_suite(Context *context, const TestSuiteArguments for (unsigned N = N_mult * (big_workgroup ? 128 : 32); N <= 1024; N <<= 1) { + if (args.single_base_size && N != 256) // Option to make length of test run somewhat reasonable. + { + continue; + } + // Texture -> SSBO enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Forward, Image, SSBO, options, cache); enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Inverse, Image, SSBO, options, cache); From 821c677db7edb4f3ccc37d550f1b7817f4dbceaa Mon Sep 17 00:00:00 2001 From: ActuallyaDeviloper Date: Sat, 7 Mar 2020 16:37:55 +0100 Subject: [PATCH 6/6] Fix some more compiler warnings. --- glfft_wisdom.cpp | 2 +- test/glfft_cli.cpp | 4 ++-- test/glfft_test.cpp | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/glfft_wisdom.cpp b/glfft_wisdom.cpp index bb5b306..4d936e2 100644 --- a/glfft_wisdom.cpp +++ b/glfft_wisdom.cpp @@ -394,7 +394,7 @@ std::pair FFTWisdom::study(Context *context, co } FFTOptions::Performance perf; - perf.shared_banked = shared_banked; + perf.shared_banked = !!shared_banked; perf.vector_size = vector_size; perf.workgroup_size_x = workgroup_size_x; perf.workgroup_size_y = workgroup_size_y; diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp index a7c2709..db0d751 100644 --- a/test/glfft_cli.cpp +++ b/test/glfft_cli.cpp @@ -143,7 +143,7 @@ struct BenchArguments unsigned warmup = 2; unsigned iterations = 20; unsigned dispatches = 50; - unsigned timeout = 1.0; + unsigned timeout = 1; Type type = ComplexToComplex; unsigned size_for_type = 2; const char *string_for_type = "C2C"; @@ -407,7 +407,7 @@ static int cli_bench(Context *context, int argc, char *argv[]) cbs.add("--warmup", [&args](CLIParser &parser) { args.warmup = parser.next_uint(); }); cbs.add("--iterations", [&args](CLIParser &parser) { args.iterations = parser.next_uint(); }); cbs.add("--dispatches", [&args](CLIParser &parser) { args.dispatches = parser.next_uint(); }); - cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = parser.next_double(); }); + cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = (unsigned int)parser.next_double(); }); cbs.add("--fp16", [&args](CLIParser&) { args.fp16 = true; }); cbs.add("--type", [&args](CLIParser &parser) { args.type = parse_type(parser.next_string(), args); }); cbs.add("--input-texture", [&args](CLIParser&) { args.input_texture = true; }); diff --git a/test/glfft_test.cpp b/test/glfft_test.cpp index 300bde3..6dbebfe 100644 --- a/test/glfft_test.cpp +++ b/test/glfft_test.cpp @@ -49,7 +49,7 @@ mufft_buffer alloc(size_t size) using cfloat = complex; -mufft_buffer create_input(unsigned N) +mufft_buffer create_input(size_t N) { auto buffer = alloc(N * sizeof(float)); float *ptr = static_cast(buffer.get()); @@ -231,7 +231,7 @@ static mufft_buffer create_reference(Type type, Direction direction, out = static_cast(output.get()); for (unsigned i = 0; i < output_size / sizeof(cfloat); i++) { - out[i] /= Nx * Ny; + out[i] /= static_cast(Nx * Ny); } return output; @@ -463,7 +463,7 @@ static inline pair fp16_to_fp32(uint32_t v) return make_pair(fp16_to_fp32(lower), fp16_to_fp32(upper)); } -static mufft_buffer convert_fp32_fp16(const float *input, unsigned N) +static mufft_buffer convert_fp32_fp16(const float *input, size_t N) { auto buffer = alloc(N * sizeof(uint16_t)); auto ptr = static_cast(buffer.get()); @@ -476,7 +476,7 @@ static mufft_buffer convert_fp32_fp16(const float *input, unsigned N) return buffer; } -static mufft_buffer convert_fp16_fp32(const uint32_t *input, unsigned N) +static mufft_buffer convert_fp16_fp32(const uint32_t *input, size_t N) { auto buffer = alloc(N * sizeof(float)); auto ptr = static_cast(buffer.get()); @@ -533,8 +533,8 @@ static void run_test_ssbo(Context *context, output_data = convert_fp16_fp32(static_cast(output_data.get()), output_size / sizeof(float)); } - float epsilon = options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32; - float min_snr = options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32; + float epsilon = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32); + float min_snr = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32); if (direction == InverseConvolve) { epsilon *= 1.5f; @@ -600,8 +600,8 @@ static void run_test_texture(Context *context, output_data = convert_fp16_fp32(static_cast(output_data.get()), output_size / sizeof(float)); } - float epsilon = options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32; - float min_snr = options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32; + float epsilon = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32); + float min_snr = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32); if (direction == InverseConvolve) { epsilon *= 1.5f; @@ -694,8 +694,8 @@ static void run_test_image(Context *context, const TestSuiteArguments &args, uns auto output_data = readback_texture(context, tex.get(), components, Nx, Ny); - float epsilon = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32; - float min_snr = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32; + float epsilon = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32); + float min_snr = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32); if (direction == InverseConvolve) { epsilon *= 1.5f;