diff --git a/glfft.cpp b/glfft.cpp
index bf55422..2c81012 100644
--- a/glfft.cpp
+++ b/glfft.cpp
@@ -522,18 +522,23 @@ static inline unsigned type_to_input_components(Type type)
 FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
          Type type, Direction direction, Target input_target, Target output_target,
-         std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
+         std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom,
+         std::string input_load_texture_code, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
     : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
 {
     set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
 
     size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
-    temp_buffer_size >>= options.type.output_fp16;
+    temp_buffer_size >>= options.type.fp16;
 
-    temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
-    if (output_target != SSBO)
+    temp_buffer = reuse_preallocated_temporary_buffer0 ?
+                  std::move(reuse_preallocated_temporary_buffer0) :
+                  context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+    if (output_target != SSBO || (options.type.output_fp16 && !options.type.fp16)) // @HigherIntermediatePrecision We may need higher intermediate precision.
     {
-        temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+        temp_buffer_image = reuse_preallocated_temporary_buffer1 ?
+                            std::move(reuse_preallocated_temporary_buffer1) :
+                            context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
     }
 
     bool expand = false;
@@ -624,7 +629,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
             // If this is the last pass and we're writing to an image, use a special shader variant.
             bool last_pass = index == last_index && i == radix_direction.size() - 1;
-            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
+            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+            bool output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
             Target out_target = last_pass ? output_target : SSBO;
             Target in_target = passes.empty() ? input_target : SSBO;
             Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
@@ -642,8 +648,9 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
                 out_target,
                 p == 1,
                 radix.shared_banked,
-                options.type.fp16, input_fp16, options.type.output_fp16,
+                options.type.fp16, input_fp16, output_fp16,
                 options.type.normalize,
+                input_load_texture_code
             };
 
             const Pass pass = {
@@ -664,7 +671,6 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
     // This way, we avoid having special purpose transforms for all FFT variants.
     if (index == 0 && (type == ComplexToReal || type == RealToComplex))
     {
-        bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
         bool last_pass = radices[1].empty();
         Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
         Target in_target = passes.empty() ? input_target : SSBO;
@@ -673,7 +679,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         unsigned uv_scale_x = 1;
 
         auto base_opts = options;
-        base_opts.type.input_fp16 = input_fp16;
+        base_opts.type.input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+        base_opts.type.output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
         auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
 
         auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
@@ -731,7 +738,7 @@ void FFT::store_shader_string(const char *path, const string &source)
 unique_ptr<Program> FFT::build_program(const Parameters &params)
 {
     string str;
-    str.reserve(16 * 1024);
+    str.reserve(64 * 1024);
 
 #if 0
     context->log("Building program:\n");
@@ -794,6 +801,10 @@ unique_ptr<Program> FFT::build_program(const Parameters &params)
         str += "#define FFT_CONVOLVE\n";
     }
 
+    str += "#define FFT_LOAD_TEXTURE_CODE ";
+    str += params.input_load_texture_code.empty() ? input_load_texture_code_default : params.input_load_texture_code;
+    str += "\n";
+
     str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
     str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
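The three appended fragments above splice the (possibly caller-supplied) load code into a single preprocessor define. A minimal standalone sketch of what that fragment of the preamble amounts to (not GLFFT code; the default string from glfft_common.hpp is duplicated locally):

    #include <cstdio>
    #include <string>

    int main()
    {
        // Same literal as input_load_texture_code_default in glfft_common.hpp.
        const char *load_code =
            "cfloat load_texture(uvec2 coord) {"
            "    return load_texture_inner(coord);"
            "}";

        std::string str;
        str += "#define FFT_LOAD_TEXTURE_CODE ";
        str += load_code;
        str += "\n";

        // Prints exactly one line of shader preamble.
        std::printf("%s", str.c_str());
    }

Because the code ends up inside one #define, an embedded newline would cut the macro short, which is where the single-line requirement documented in glfft.hpp comes from.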
@@ -1001,9 +1012,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
     Resource *buffers[2] = {
         input,
-        passes.size() & 1 ?
-            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
-            temp_buffer.get(),
+        (!temp_buffer_image && passes.size() & 1) ? output : temp_buffer.get() // If no 'temp_buffer_image' is available, we must use the output buffer directly.
     };
 
     if (input_aux != 0)
@@ -1058,6 +1067,11 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
         constant_data.stride = pass.stride;
         p *= pass.parameters.radix;
 
+        if (pass_index + 1 >= passes.size()) // In the last pass we need to inject our output buffer.
+        {
+            buffers[1] = output;
+        }
+
         if (pass.parameters.input_target != SSBO)
         {
             cmd->bind_texture(BindingTexture0, static_cast<Texture *>(buffers[0]));
@@ -1117,7 +1131,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
                 break;
             }
         }
-        cmd->bind_storage_texture(BindingImage, static_cast<Texture *>(output), format);
+        cmd->bind_storage_texture(BindingImage, static_cast<Texture *>(buffers[1]), format);
     }
     else
     {
@@ -1144,9 +1158,14 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
         if (pass_index == 0)
        {
-            buffers[0] = passes.size() & 1 ?
-                temp_buffer.get() :
-                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
+            if (!temp_buffer_image) // If no 'temp_buffer_image' is available, we must use the output buffer directly.
+            {
+                buffers[0] = passes.size() & 1 ? temp_buffer.get() : output;
+            }
+            else
+            {
+                buffers[0] = temp_buffer_image.get();
+            }
        }
 
         swap(buffers[0], buffers[1]);
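To see what the rewritten buffer selection in process() does across passes, here is a small standalone sketch (not part of GLFFT; the strings are just labels) that mirrors the initialisation, the last-pass output injection and the first-pass fix-up above:

    #include <cstdio>
    #include <cstddef>
    #include <utility>

    // Mirrors the buffer schedule of FFT::process() as changed above.
    static void trace(std::size_t num_passes, bool has_temp_buffer_image)
    {
        const char *input = "input", *output = "output";
        const char *temp_buffer = "temp_buffer", *temp_buffer_image = "temp_buffer_image";

        const char *buffers[2] = {
            input,
            (!has_temp_buffer_image && (num_passes & 1)) ? output : temp_buffer,
        };

        for (std::size_t pass_index = 0; pass_index < num_passes; pass_index++)
        {
            if (pass_index + 1 >= num_passes)
                buffers[1] = output; // Last pass: inject the real output resource.

            std::printf("pass %zu: %s -> %s\n", pass_index, buffers[0], buffers[1]);

            if (pass_index == 0)
                buffers[0] = !has_temp_buffer_image ?
                    ((num_passes & 1) ? temp_buffer : output) : temp_buffer_image;

            std::swap(buffers[0], buffers[1]);
        }
    }

    int main()
    {
        trace(3, false); // e.g. SSBO output without the extra scratch buffer
        trace(3, true);  // e.g. image output, or fp32 intermediates with fp16 output
    }

With three passes and no temp_buffer_image this prints input -> output, output -> temp_buffer, temp_buffer -> output, so the output SSBO doubles as scratch; with temp_buffer_image present it prints input -> temp_buffer, temp_buffer -> temp_buffer_image, temp_buffer_image -> output.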
diff --git a/glfft.hpp b/glfft.hpp
index c3b1a69..e1c6252 100644
--- a/glfft.hpp
+++ b/glfft.hpp
@@ -55,10 +55,25 @@ class FFT
     /// @param options FFT options such as performance related parameters and types.
     /// @param wisdom GLFFT wisdom which can override performance related options
     ///               (options.performance is used as a fallback).
+    /// @param input_load_texture_code
+    ///               Custom code for sampling the input texture can be inserted here.
+    ///               This must consist of a single line and must define a function with signature
+    ///               "cfloat load_texture(uvec2 coord)"; it can call "cfloat load_texture_inner(uvec2 coord)".
+    /// @param reuse_preallocated_temporary_buffer0
+    ///               For large FFTs, a large internal temporary buffer is required as well. To reduce memory consumption,
+    ///               you can provide a preallocated buffer here that can be shared with other parts of the program.
+    ///               The buffer must have size at least Nx * Ny * (type == ComplexToComplexDual ? 4 : 2) * (options.type.fp16 ? 2 : 4).
+    ///               The provided buffer must not be used while the FFT is in progress and will contain unpredictable garbage data afterwards.
+    /// @param reuse_preallocated_temporary_buffer1
+    ///               Same as reuse_preallocated_temporary_buffer0, and used only if the output is a texture.
+    ///               May be aliased with the input if the input is not needed again after processing.
     FFT(Context *context, unsigned Nx, unsigned Ny,
         Type type, Direction direction, Target input_target, Target output_target,
         std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
-        const FFTWisdom &wisdom = FFTWisdom());
+        const FFTWisdom &wisdom = FFTWisdom(),
+        std::string input_load_texture_code = input_load_texture_code_default,
+        std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr,
+        std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1 = nullptr);
 
     /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
     ///
@@ -115,12 +130,12 @@ class FFT
     double get_cost() const { return cost; }
 
     /// @brief Returns number of passes (glDispatchCompute) in a process() call.
-    unsigned get_num_passes() const { return passes.size(); }
+    size_t get_num_passes() const { return passes.size(); }
 
     /// @brief Returns Nx.
-    unsigned get_dimension_x() const { return size_x; }
+    size_t get_dimension_x() const { return size_x; }
     /// @brief Returns Ny.
-    unsigned get_dimension_y() const { return size_y; }
+    size_t get_dimension_y() const { return size_y; }
 
     /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
     ///
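A hypothetical usage sketch of the extended constructor (the helper name and the surrounding objects are made up; create_buffer and AccessStreamCopy are used the same way the constructor itself uses them in glfft.cpp, and the scratch size follows the formula documented above):

    #include <cstddef>
    #include <memory>
    #include <utility>
    #include "glfft.hpp"

    // Hypothetical helper, not part of this patch: caller-owned scratch buffers
    // plus a single-line load_texture override.
    static std::unique_ptr<GLFFT::FFT> make_fft_with_shared_scratch(
            GLFFT::Context *context,
            std::shared_ptr<GLFFT::ProgramCache> cache,
            const GLFFT::FFTOptions &options,
            const GLFFT::FFTWisdom &wisdom,
            unsigned Nx, unsigned Ny)
    {
        // Size formula from the documentation above, here for ComplexToComplex:
        // e.g. 1024 * 1024 * 2 * 4 bytes = 8 MiB with fp32 intermediates.
        size_t scratch_size = size_t(Nx) * Ny * 2 * (options.type.fp16 ? 2 : 4);

        auto scratch0 = context->create_buffer(nullptr, scratch_size, GLFFT::AccessStreamCopy);
        auto scratch1 = context->create_buffer(nullptr, scratch_size, GLFFT::AccessStreamCopy);

        return std::unique_ptr<GLFFT::FFT>(new GLFFT::FFT(
                context, Nx, Ny,
                GLFFT::ComplexToComplex, GLFFT::Forward,
                GLFFT::SSBO, GLFFT::Image,
                std::move(cache), options, wisdom,
                "cfloat load_texture(uvec2 coord) { return load_texture_inner(coord); }",
                std::move(scratch0), std::move(scratch1)));
    }

Whether sharing the scratch buffers pays off depends on the transform sizes involved; for small FFTs the constructor's own allocation is just as good.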
diff --git a/glfft_common.hpp b/glfft_common.hpp
index 4065f68..dc35cb0 100644
--- a/glfft_common.hpp
+++ b/glfft_common.hpp
@@ -33,7 +33,7 @@
 namespace GLFFT
 {
 
-enum Direction
+enum Direction : char
 {
     /// Forward FFT transform.
     Forward = -1,
@@ -44,7 +44,7 @@ enum Direction
     Inverse = 1
 };
 
-enum Mode
+enum Mode : char
 {
     Horizontal,
     HorizontalDual,
@@ -55,7 +55,7 @@ enum Mode
     ResolveComplexToReal,
 };
 
-enum Type
+enum Type : char
 {
     /// Regular complex-to-complex transform.
     ComplexToComplex,
@@ -68,7 +68,7 @@ enum Type
     RealToComplex
 };
 
-enum Target
+enum Target : char
 {
     /// GL_SHADER_STORAGE_BUFFER
     SSBO,
@@ -81,6 +81,11 @@ enum Target
     ImageReal
 };
 
+static constexpr char const input_load_texture_code_default[] =
+    "cfloat load_texture(uvec2 coord) {"
+    "    return load_texture_inner(coord);"
+    "}";
+
 struct Parameters
 {
     unsigned workgroup_size_x;
@@ -96,10 +101,11 @@ struct Parameters
     bool shared_banked;
     bool fft_fp16, input_fp16, output_fp16;
     bool fft_normalize;
-
+    std::string input_load_texture_code; // If empty, defaults to input_load_texture_code_default. Unfortunately we can't give it a default member initializer here, because that would break aggregate (brace) initialization of Parameters in C++11.
     bool operator==(const Parameters &other) const
     {
-        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
+        return std::memcmp(this, &other, offsetof(Parameters, input_load_texture_code)) == 0
+            && input_load_texture_code == other.input_load_texture_code;
     }
 };
 
@@ -126,7 +132,7 @@ struct FFTOptions
 
     struct Type
     {
-        /// Whether internal shader should be mediump float.
+        /// Whether the internal shader and intermediate results should be mediump float.
         bool fp16 = false;
 
         /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
         bool input_fp16 = false;
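As a hypothetical counterpart to input_load_texture_code_default above, a caller-side override could look like the following. It is illustrative only; cfloat is the shader's complex type (vec2, or vec4 in the dual case), so multiplying by cfloat(0.5) scales both the real and imaginary parts:

    // Hypothetical override: pre-scale the sampled input by 0.5 before the FFT.
    // Must stay on a single logical line, since it is pasted into one #define.
    static constexpr char const input_load_texture_code_half_scale[] =
        "cfloat load_texture(uvec2 coord) {"
        "    return cfloat(0.5) * load_texture_inner(coord);"
        "}";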
diff --git a/glfft_wisdom.cpp b/glfft_wisdom.cpp
index bb5b306..4d936e2 100644
--- a/glfft_wisdom.cpp
+++ b/glfft_wisdom.cpp
@@ -394,7 +394,7 @@ std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, co
     }
 
     FFTOptions::Performance perf;
-    perf.shared_banked = shared_banked;
+    perf.shared_banked = !!shared_banked;
     perf.vector_size = vector_size;
     perf.workgroup_size_x = workgroup_size_x;
     perf.workgroup_size_y = workgroup_size_y;
diff --git a/glsl/fft_common.comp b/glsl/fft_common.comp
index c8f0e3c..df95624 100644
--- a/glsl/fft_common.comp
+++ b/glsl/fft_common.comp
@@ -314,7 +314,7 @@ cfloat load_texture(sampler2D sampler, uvec2 coord)
 #endif
 }
 
-cfloat load_texture(uvec2 coord)
+cfloat load_texture_inner(uvec2 coord)
 {
 #ifdef FFT_CONVOLVE
     // Convolution in frequency domain is multiplication.
@@ -326,6 +326,9 @@ cfloat load_texture(uvec2 coord)
 #endif
 }
 
+// This must define a function with signature cfloat load_texture(uvec2 coord).
+FFT_LOAD_TEXTURE_CODE
+
 // Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere.
 #ifdef FFT_VEC8
 cfloat load_global(uint offset)
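For reference, once the preprocessor has substituted the define emitted by build_program(), the injected section above reduces to a plain wrapper when the default code is used. A sketch of the resulting GLSL (illustrative, whitespace aside; a custom string passed to the FFT constructor replaces this body instead):

    // Expansion of FFT_LOAD_TEXTURE_CODE with the default code.
    cfloat load_texture(uvec2 coord) { return load_texture_inner(coord); }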
diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp
index 2ada152..db0d751 100644
--- a/test/glfft_cli.cpp
+++ b/test/glfft_cli.cpp
@@ -143,7 +143,7 @@ struct BenchArguments
     unsigned warmup = 2;
     unsigned iterations = 20;
     unsigned dispatches = 50;
-    unsigned timeout = 1.0;
+    unsigned timeout = 1;
     Type type = ComplexToComplex;
     unsigned size_for_type = 2;
     const char *string_for_type = "C2C";
@@ -335,6 +335,7 @@ static int cli_test(Context *context, int argc, char *argv[])
     cbs.add("--minimum-snr-fp32", [&args](CLIParser &parser) { args.min_snr_fp32 = parser.next_double(); });
     cbs.add("--epsilon-fp16", [&args](CLIParser &parser) { args.epsilon_fp16 = parser.next_double(); });
     cbs.add("--epsilon-fp32", [&args](CLIParser &parser) { args.epsilon_fp32 = parser.next_double(); });
+    cbs.add("--single-base-size", [&args](CLIParser &parser) { args.single_base_size = true; });
     cbs.error_handler = [context]{ cli_test_help(context); };
 
     CLIParser parser(move(cbs), argc, argv);
@@ -406,7 +407,7 @@ static int cli_bench(Context *context, int argc, char *argv[])
     cbs.add("--warmup", [&args](CLIParser &parser) { args.warmup = parser.next_uint(); });
     cbs.add("--iterations", [&args](CLIParser &parser) { args.iterations = parser.next_uint(); });
     cbs.add("--dispatches", [&args](CLIParser &parser) { args.dispatches = parser.next_uint(); });
-    cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = parser.next_double(); });
+    cbs.add("--timeout", [&args](CLIParser &parser) { args.timeout = (unsigned int)parser.next_double(); });
     cbs.add("--fp16", [&args](CLIParser&) { args.fp16 = true; });
     cbs.add("--type", [&args](CLIParser &parser) { args.type = parse_type(parser.next_string(), args); });
     cbs.add("--input-texture", [&args](CLIParser&) { args.input_texture = true; });
diff --git a/test/glfft_cli.hpp b/test/glfft_cli.hpp
index 3d29e19..dff67ba 100644
--- a/test/glfft_cli.hpp
+++ b/test/glfft_cli.hpp
@@ -40,6 +40,7 @@ namespace GLFFT
         unsigned test_id_min = 0;
         unsigned test_id_max = 0;
         bool exhaustive = true;
+        bool single_base_size = true;
         bool throw_on_fail = false;
         double min_snr_fp16 = 50.0;
         double min_snr_fp32 = 100.0;
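Assuming the test front-end is built and invoked as in the upstream project (the binary name and subcommand spelling are not part of this patch), the new flag would be used along these lines:

    ./glfft_cli test --single-base-size

This restricts the generated test matrix to the N == 256 base size handled in glfft_test.cpp below, which keeps a full run reasonably short.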
"yes" : "no"); unique_ptr test_input; @@ -690,8 +694,8 @@ static void run_test_image(Context *context, const TestSuiteArguments &args, uns auto output_data = readback_texture(context, tex.get(), components, Nx, Ny); - float epsilon = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32; - float min_snr = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32; + float epsilon = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32); + float min_snr = static_cast(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32); if (direction == InverseConvolve) { epsilon *= 1.5f; @@ -807,6 +811,11 @@ void GLFFT::Internal::run_test_suite(Context *context, const TestSuiteArguments for (unsigned N = N_mult * (big_workgroup ? 128 : 32); N <= 1024; N <<= 1) { + if (args.single_base_size && N != 256) // Option to make length of test run somewhat reasonable. + { + continue; + } + // Texture -> SSBO enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Forward, Image, SSBO, options, cache); enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Inverse, Image, SSBO, options, cache);