From fc5c078b87d5242d21cdbb680c9a84db66d04260 Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Mon, 2 Mar 2020 19:20:55 +0100
Subject: [PATCH 1/6] Add new parameters to the FFT's constructor to be able to
 reuse internal buffers and avoid unnecessary GPU memory usage.

---
 glfft.cpp | 26 +++++++++++++++++---------
 glfft.hpp | 11 ++++++++++-
 2 files changed, 27 insertions(+), 10 deletions(-)
diff --git a/glfft.cpp b/glfft.cpp
index bf55422..fae49c0 100644
--- a/glfft.cpp
+++ b/glfft.cpp
@@ -522,7 +522,8 @@ static inline unsigned type_to_input_components(Type type)
 
 FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         Type type, Direction direction, Target input_target, Target output_target,
-        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
+        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom,
+       std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
     : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
 {
     set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
@@ -530,10 +531,14 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
     size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
     temp_buffer_size >>= options.type.output_fp16;
 
-    temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+    temp_buffer = reuse_preallocated_temporary_buffer0 ? 
+        std::move(reuse_preallocated_temporary_buffer0) :
+        context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
     if (output_target != SSBO)
     {
-        temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
+        temp_buffer_image = reuse_preallocated_temporary_buffer1 ?
+            std::move(reuse_preallocated_temporary_buffer1) :
+            context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
     }
 
     bool expand = false;
@@ -1001,9 +1006,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
     Resource *buffers[2] = {
         input,
-        passes.size() & 1 ?
-            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
-            temp_buffer.get(),
+        (passes.back().parameters.output_target == SSBO && passes.size() & 1) ? output : temp_buffer.get()
     };
 
     if (input_aux != 0)
@@ -1144,9 +1147,14 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
         if (pass_index == 0)
         {
-            buffers[0] = passes.size() & 1 ?
-                temp_buffer.get() :
-                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
+            if (passes.back().parameters.output_target == SSBO)
+            {
+                buffers[0] = passes.size() & 1 ? temp_buffer.get() : output;
+            }
+            else
+            {
+                buffers[0] = temp_buffer_image.get();
+            }
         }
 
         swap(buffers[0], buffers[1]);
diff --git a/glfft.hpp b/glfft.hpp
index c3b1a69..cd0ff4b 100644
--- a/glfft.hpp
+++ b/glfft.hpp
@@ -55,10 +55,19 @@ class FFT
         /// @param options       FFT options such as performance related parameters and types.
         /// @param wisdom        GLFFT wisdom which can override performance related options
         ///                      (options.performance is used as a fallback).
+        /// @param reuse_preallocated_temporary_buffer0
+        ///                      For large FFTs also a large internal temporary buffer is required. To reduce memory consumption
+        ///                      you can provide a preallocated buffer here that can be shared with other parts of the program.
+        ///                      The buffer must have size at least Nx * Ny * (type == ComplexToComplexDual ? 4 : 2) * (options.type.fp16 ? 2 : 4).
+        ///                      The provided buffer must not be used while the FFT is in progress and will contain unpredictable garbage data afterwards.
+        /// @param reuse_preallocated_temporary_buffer1
+        ///                      Same as reuse_preallocated_temporary_buffer0 and used only if the output is a texture.
+        ///                      May be aliased with the input if the input is not needed again after processing.
         FFT(Context *context, unsigned Nx, unsigned Ny,
                 Type type, Direction direction, Target input_target, Target output_target,
                 std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
-                const FFTWisdom &wisdom = FFTWisdom());
+                const FFTWisdom &wisdom = FFTWisdom(), std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr, 
+                std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1 = nullptr);
 
         /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
         ///

From 8af8eabedc6995bb98e53d168356572c37b84e57 Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Mon, 2 Mar 2020 22:19:41 +0100
Subject: [PATCH 2/6] Allow specifying custom sampling code in the shader.

---
 glfft.cpp            |  9 +++++++--
 glfft.hpp            |  8 +++++++-
 glfft_common.hpp     | 18 ++++++++++++------
 glsl/fft_common.comp |  5 ++++-
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/glfft.cpp b/glfft.cpp
index fae49c0..d1bdefe 100644
--- a/glfft.cpp
+++ b/glfft.cpp
@@ -523,7 +523,7 @@ static inline unsigned type_to_input_components(Type type)
 FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         Type type, Direction direction, Target input_target, Target output_target,
         std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom,
-       std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
+        std::string input_load_texture_code, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0, std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1)
     : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
 {
     set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
@@ -649,6 +649,7 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
                 radix.shared_banked,
                 options.type.fp16, input_fp16, options.type.output_fp16,
                 options.type.normalize,
+                input_load_texture_code
             };
 
             const Pass pass = {
@@ -736,7 +737,7 @@ void FFT::store_shader_string(const char *path, const string &source)
 unique_ptr<Program> FFT::build_program(const Parameters &params)
 {
     string str;
-    str.reserve(16 * 1024);
+    str.reserve(64 * 1024);
 
 #if 0
     context->log("Building program:\n");
@@ -799,6 +800,10 @@ unique_ptr<Program> FFT::build_program(const Parameters &params)
         str += "#define FFT_CONVOLVE\n";
     }
 
+    str += "#define FFT_LOAD_TEXTURE_CODE ";
+    str += params.input_load_texture_code.empty() ? input_load_texture_code_default : params.input_load_texture_code;
+    str += "\n";
+
     str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
 
     str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
diff --git a/glfft.hpp b/glfft.hpp
index cd0ff4b..a202098 100644
--- a/glfft.hpp
+++ b/glfft.hpp
@@ -55,6 +55,10 @@ class FFT
         /// @param options       FFT options such as performance related parameters and types.
         /// @param wisdom        GLFFT wisdom which can override performance related options
         ///                      (options.performance is used as a fallback).
+        /// @param input_load_texture_code
+        ///                      Custom code for sampling the input texture can be inserted here.
+        ///                      This must only use a single line and must define a function with signature 
+        ///                      "cfloat load_texture(uvec2 coord)" and can call "cfloat load_texture_inner(uvec2 coord)".
         /// @param reuse_preallocated_temporary_buffer0
         ///                      For large FFTs also a large internal temporary buffer is required. To reduce memory consumption
         ///                      you can provide a preallocated buffer here that can be shared with other parts of the program.
@@ -66,7 +70,9 @@ class FFT
         FFT(Context *context, unsigned Nx, unsigned Ny,
                 Type type, Direction direction, Target input_target, Target output_target,
                 std::shared_ptr<ProgramCache> cache, const FFTOptions &options,
-                const FFTWisdom &wisdom = FFTWisdom(), std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr, 
+                const FFTWisdom &wisdom = FFTWisdom(), 
+                std::string input_load_texture_code = input_load_texture_code_default,
+                std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer0 = nullptr,
                 std::unique_ptr<Buffer> reuse_preallocated_temporary_buffer1 = nullptr);
 
         /// @brief Creates a single stage FFT. Used mostly internally for benchmarking partial FFTs.
diff --git a/glfft_common.hpp b/glfft_common.hpp
index 4065f68..b2b6a1a 100644
--- a/glfft_common.hpp
+++ b/glfft_common.hpp
@@ -33,7 +33,7 @@
 namespace GLFFT
 {
 
-enum Direction
+enum Direction : char
 {
     /// Forward FFT transform.
     Forward = -1,
@@ -44,7 +44,7 @@ enum Direction
     Inverse = 1
 };
 
-enum Mode
+enum Mode : char
 {
     Horizontal,
     HorizontalDual,
@@ -55,7 +55,7 @@ enum Mode
     ResolveComplexToReal,
 };
 
-enum Type
+enum Type : char
 {
     /// Regular complex-to-complex transform.
     ComplexToComplex,
@@ -68,7 +68,7 @@ enum Type
     RealToComplex
 };
 
-enum Target
+enum Target : char
 {
     /// GL_SHADER_STORAGE_BUFFER
     SSBO,
@@ -81,6 +81,11 @@ enum Target
     ImageReal
 };
 
+static constexpr char const input_load_texture_code_default[] = 
+    "cfloat load_texture(uvec2 coord) {"
+    "    return load_texture_inner(coord);"
+    "}";
+
 struct Parameters
 {
     unsigned workgroup_size_x;
@@ -96,10 +101,11 @@ struct Parameters
     bool shared_banked;
     bool fft_fp16, input_fp16, output_fp16;
     bool fft_normalize;
-
+    std::string input_load_texture_code; // If empty defaults to input_load_texture_code_default. Unfortunately we can't put it here because that breaks the initializer lists in C++11.
     bool operator==(const Parameters &other) const
     {
-        return std::memcmp(this, &other, sizeof(Parameters)) == 0;
+        return std::memcmp(this, &other, offsetof(Parameters, input_load_texture_code)) == 0
+            && input_load_texture_code == other.input_load_texture_code;
     }
 };
 
diff --git a/glsl/fft_common.comp b/glsl/fft_common.comp
index c8f0e3c..df95624 100644
--- a/glsl/fft_common.comp
+++ b/glsl/fft_common.comp
@@ -314,7 +314,7 @@ cfloat load_texture(sampler2D sampler, uvec2 coord)
 #endif
 }
 
-cfloat load_texture(uvec2 coord)
+cfloat load_texture_inner(uvec2 coord)
 {
 #ifdef FFT_CONVOLVE
     // Convolution in frequency domain is multiplication.
@@ -326,6 +326,9 @@ cfloat load_texture(uvec2 coord)
 #endif
 }
 
+// This must define a function with signature cfloat load_texture(uvec2 coord)
+FFT_LOAD_TEXTURE_CODE
+
 // Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere.
 #ifdef FFT_VEC8
 cfloat load_global(uint offset)

From 9502db7eca4e88a869882b048b325bf6ba03a7f2 Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Tue, 3 Mar 2020 19:28:07 +0100
Subject: [PATCH 3/6] Fix a couple warnings by using the correct data type.

---
 glfft.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/glfft.hpp b/glfft.hpp
index a202098..e1c6252 100644
--- a/glfft.hpp
+++ b/glfft.hpp
@@ -130,12 +130,12 @@ class FFT
         double get_cost() const { return cost; }
 
         /// @brief Returns number of passes (glDispatchCompute) in a process() call.
-        unsigned get_num_passes() const { return passes.size(); }
+        size_t get_num_passes() const { return passes.size(); }
 
         /// @brief Returns Nx.
-        unsigned get_dimension_x() const { return size_x; }
+        size_t get_dimension_x() const { return size_x; }
         /// @brief Returns Ny.
-        unsigned get_dimension_y() const { return size_y; }
+        size_t get_dimension_y() const { return size_y; }
 
         /// @brief Sets offset and scale parameters for normalized texel coordinates when sampling textures.
         ///

From 70d927c7658282a1652cb4e0b00e51a0a1f1cb6d Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Sat, 7 Mar 2020 00:28:22 +0100
Subject: [PATCH 4/6] Fix glfft intermediate data formats. Sometimes it is
 internally using half precision for intermediate results nobody asked for.
 This introduces rounding errors.

---
 glfft.cpp        | 24 +++++++++++++++---------
 glfft_common.hpp |  2 +-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/glfft.cpp b/glfft.cpp
index d1bdefe..2c81012 100644
--- a/glfft.cpp
+++ b/glfft.cpp
@@ -529,12 +529,12 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
     set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
 
     size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
-    temp_buffer_size >>= options.type.output_fp16;
+    temp_buffer_size >>= options.type.fp16;
 
     temp_buffer = reuse_preallocated_temporary_buffer0 ? 
         std::move(reuse_preallocated_temporary_buffer0) :
         context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
-    if (output_target != SSBO)
+    if (output_target != SSBO || (options.type.output_fp16 && !options.type.fp16)) // @HigherIntermediatePrecision We may need higher intermediate precision.
     {
         temp_buffer_image = reuse_preallocated_temporary_buffer1 ?
             std::move(reuse_preallocated_temporary_buffer1) :
@@ -629,7 +629,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
             // If this is the last pass and we're writing to an image, use a special shader variant.
             bool last_pass = index == last_index && i == radix_direction.size() - 1;
 
-            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
+            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+            bool output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
             Target out_target = last_pass ? output_target : SSBO;
             Target in_target = passes.empty() ? input_target : SSBO;
             Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
@@ -647,7 +648,7 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
                 out_target,
                 p == 1,
                 radix.shared_banked,
-                options.type.fp16, input_fp16, options.type.output_fp16,
+                options.type.fp16, input_fp16, output_fp16,
                 options.type.normalize,
                 input_load_texture_code
             };
@@ -670,7 +671,6 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
         // This way, we avoid having special purpose transforms for all FFT variants.
         if (index == 0 && (type == ComplexToReal || type == RealToComplex))
         {
-            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
             bool last_pass = radices[1].empty();
             Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
             Target in_target = passes.empty() ? input_target : SSBO;
@@ -679,7 +679,8 @@ FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
             unsigned uv_scale_x = 1;
 
             auto base_opts = options;
-            base_opts.type.input_fp16 = input_fp16;
+            base_opts.type.input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.fp16;
+            base_opts.type.output_fp16 = last_pass ? options.type.output_fp16 : options.type.fp16;
 
             auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
             auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
@@ -1011,7 +1012,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
     Resource *buffers[2] = {
         input,
-        (passes.back().parameters.output_target == SSBO && passes.size() & 1) ? output : temp_buffer.get()
+        (!temp_buffer_image && passes.size() & 1) ? output : temp_buffer.get() // If no 'temp_buffer_image' is available, we must use the output buffer directly.
     };
 
     if (input_aux != 0)
@@ -1066,6 +1067,11 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
         constant_data.stride = pass.stride;
         p *= pass.parameters.radix;
 
+        if (pass_index + 1 >= passes.size()) // In the last pass we need to inject our output buffer.
+        {
+            buffers[1] = output; 
+        }
+
         if (pass.parameters.input_target != SSBO)
         {
             cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
@@ -1125,7 +1131,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
                         break;
                 }
             }
-            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
+            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(buffers[1]), format);
         }
         else
         {
@@ -1152,7 +1158,7 @@ void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resourc
 
         if (pass_index == 0)
         {
-            if (passes.back().parameters.output_target == SSBO)
+            if (!temp_buffer_image) // If no 'temp_buffer_image' is available, we must use the output buffer directly.
             {
                 buffers[0] = passes.size() & 1 ? temp_buffer.get() : output;
             }
diff --git a/glfft_common.hpp b/glfft_common.hpp
index b2b6a1a..dc35cb0 100644
--- a/glfft_common.hpp
+++ b/glfft_common.hpp
@@ -132,7 +132,7 @@ struct FFTOptions
 
     struct Type
     {
-        /// Whether internal shader should be mediump float.
+        /// Whether internal shader and intermediate results should be mediump float.
         bool fp16 = false;
         /// Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
         bool input_fp16 = false;

From 1bc68b75fd1b47d479e0b58319a958d0bd90d51a Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Sat, 7 Mar 2020 15:45:05 +0100
Subject: [PATCH 5/6] New option to only run some tests.

---
 test/glfft_cli.cpp  |  1 +
 test/glfft_cli.hpp  |  1 +
 test/glfft_test.cpp | 39 ++++++++++++++++++++++++---------------
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp
index 2ada152..a7c2709 100644
--- a/test/glfft_cli.cpp
+++ b/test/glfft_cli.cpp
@@ -335,6 +335,7 @@ static int cli_test(Context *context, int argc, char *argv[])
     cbs.add("--minimum-snr-fp32", [&args](CLIParser &parser) { args.min_snr_fp32 = parser.next_double(); });
     cbs.add("--epsilon-fp16",     [&args](CLIParser &parser) { args.epsilon_fp16 = parser.next_double(); });
     cbs.add("--epsilon-fp32",     [&args](CLIParser &parser) { args.epsilon_fp32 = parser.next_double(); });
+    cbs.add("--single-base-size",      [&args](CLIParser &parser) { args.single_base_size = true; });
 
     cbs.error_handler = [context]{ cli_test_help(context); };
     CLIParser parser(move(cbs), argc, argv);
diff --git a/test/glfft_cli.hpp b/test/glfft_cli.hpp
index 3d29e19..dff67ba 100644
--- a/test/glfft_cli.hpp
+++ b/test/glfft_cli.hpp
@@ -40,6 +40,7 @@ namespace GLFFT
             unsigned test_id_min = 0;
             unsigned test_id_max = 0;
             bool exhaustive = true;
+            bool single_base_size = false; // Opt-in via --single-base-size: when set, the test suite skips every base size except N == 256.
             bool throw_on_fail = false;
             double min_snr_fp16 = 50.0;
             double min_snr_fp32 = 100.0;
diff --git a/test/glfft_test.cpp b/test/glfft_test.cpp
index 208b423..300bde3 100644
--- a/test/glfft_test.cpp
+++ b/test/glfft_test.cpp
@@ -494,11 +494,13 @@ static mufft_buffer convert_fp16_fp32(const uint32_t *input, unsigned N)
 static void run_test_ssbo(Context *context,
         const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr<ProgramCache> &cache)
 {
-    context->log("Running SSBO -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n",
-            Nx, Ny, direction_to_str(direction), type_to_str(type),
-            options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
-            options.type.input_fp16 ? "yes" : "no",
-            options.type.output_fp16 ? "yes" : "no");
+    context->log("Running SSBO -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n",
+        Nx, Ny, direction_to_str(direction), type_to_str(type),
+        options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
+        options.type.input_fp16 ? "yes" : "no",
+        options.type.output_fp16 ? "yes" : "no",
+        options.type.fp16 ? "yes" : "no");
+
 
     unique_ptr<Buffer> test_input;
     unique_ptr<Buffer> test_output;
@@ -545,11 +547,12 @@ static void run_test_ssbo(Context *context,
 static void run_test_texture(Context *context,
         const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr<ProgramCache> &cache)
 {
-    context->log("Running Texture -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n",
-            Nx, Ny, direction_to_str(direction), type_to_str(type),
-            options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
-            options.type.input_fp16 ? "yes" : "no",
-            options.type.output_fp16 ? "yes" : "no");
+    context->log("Running Texture -> SSBO FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n",
+        Nx, Ny, direction_to_str(direction), type_to_str(type),
+        options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
+        options.type.input_fp16 ? "yes" : "no",
+        options.type.output_fp16 ? "yes" : "no",
+        options.type.fp16 ? "yes" : "no");
 
     unique_ptr<Texture> test_input;
     unique_ptr<Buffer> test_output;
@@ -634,11 +637,12 @@ static mufft_buffer readback_texture(Context *context, Texture *tex, unsigned co
 
 static void run_test_image(Context *context, const TestSuiteArguments &args, unsigned Nx, unsigned Ny, Type type, Direction direction, const FFTOptions &options, const shared_ptr<ProgramCache> &cache)
 {
-    context->log("Running SSBO -> Image FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s ...\n",
-            Nx, Ny, direction_to_str(direction), type_to_str(type),
-            options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
-            options.type.input_fp16 ? "yes" : "no",
-            options.type.output_fp16 ? "yes" : "no");
+    context->log("Running SSBO -> Image FFT, %04u x %04u\n\t%7s transform\n\t%8s\n\tbanked shared %s\n\tvector size %u\n\twork group (%u, %u)\n\tinput fp16 %s\n\toutput fp16 %s\n\tfp16 %s ...\n",
+        Nx, Ny, direction_to_str(direction), type_to_str(type),
+        options.performance.shared_banked ? "yes" : "no", options.performance.vector_size, options.performance.workgroup_size_x, options.performance.workgroup_size_y,
+        options.type.input_fp16 ? "yes" : "no",
+        options.type.output_fp16 ? "yes" : "no", 
+        options.type.fp16 ? "yes" : "no");
 
     unique_ptr<Buffer> test_input;
 
@@ -807,6 +811,11 @@ void GLFFT::Internal::run_test_suite(Context *context, const TestSuiteArguments
 
         for (unsigned N = N_mult * (big_workgroup ? 128 : 32); N <= 1024; N <<= 1)
         {
+            if (args.single_base_size && N != 256) // Option to make length of test run somewhat reasonable.
+            {
+                continue;
+            }
+
             // Texture -> SSBO
             enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Forward, Image, SSBO, options, cache);
             enqueue_test(context, tests, args, N, N / 2, ComplexToComplex, Inverse, Image, SSBO, options, cache);

From 821c677db7edb4f3ccc37d550f1b7817f4dbceaa Mon Sep 17 00:00:00 2001
From: ActuallyaDeviloper <ActuallyaDeviloper@users.noreply.github.com>
Date: Sat, 7 Mar 2020 16:37:55 +0100
Subject: [PATCH 6/6] Fix some more compiler warnings.

---
 glfft_wisdom.cpp    |  2 +-
 test/glfft_cli.cpp  |  4 ++--
 test/glfft_test.cpp | 20 ++++++++++----------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/glfft_wisdom.cpp b/glfft_wisdom.cpp
index bb5b306..4d936e2 100644
--- a/glfft_wisdom.cpp
+++ b/glfft_wisdom.cpp
@@ -394,7 +394,7 @@ std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, co
                     }
 
                     FFTOptions::Performance perf;
-                    perf.shared_banked = shared_banked;
+                    perf.shared_banked = !!shared_banked;
                     perf.vector_size = vector_size;
                     perf.workgroup_size_x = workgroup_size_x;
                     perf.workgroup_size_y = workgroup_size_y;
diff --git a/test/glfft_cli.cpp b/test/glfft_cli.cpp
index a7c2709..db0d751 100644
--- a/test/glfft_cli.cpp
+++ b/test/glfft_cli.cpp
@@ -143,7 +143,7 @@ struct BenchArguments
     unsigned warmup = 2;
     unsigned iterations = 20;
     unsigned dispatches = 50;
-    unsigned timeout = 1.0;
+    unsigned timeout = 1;
     Type type = ComplexToComplex;
     unsigned size_for_type = 2;
     const char *string_for_type = "C2C";
@@ -407,7 +407,7 @@ static int cli_bench(Context *context, int argc, char *argv[])
     cbs.add("--warmup",         [&args](CLIParser &parser) { args.warmup = parser.next_uint(); });
     cbs.add("--iterations",     [&args](CLIParser &parser) { args.iterations = parser.next_uint(); });
     cbs.add("--dispatches",     [&args](CLIParser &parser) { args.dispatches = parser.next_uint(); });
-    cbs.add("--timeout",        [&args](CLIParser &parser) { args.timeout = parser.next_double(); });
+    cbs.add("--timeout",        [&args](CLIParser &parser) { args.timeout = (unsigned int)parser.next_double(); });
     cbs.add("--fp16",           [&args](CLIParser&)        { args.fp16 = true; });
     cbs.add("--type",           [&args](CLIParser &parser) { args.type = parse_type(parser.next_string(), args); });
     cbs.add("--input-texture",  [&args](CLIParser&)        { args.input_texture = true; });
diff --git a/test/glfft_test.cpp b/test/glfft_test.cpp
index 300bde3..6dbebfe 100644
--- a/test/glfft_test.cpp
+++ b/test/glfft_test.cpp
@@ -49,7 +49,7 @@ mufft_buffer alloc(size_t size)
 
 using cfloat = complex<float>;
 
-mufft_buffer create_input(unsigned N)
+mufft_buffer create_input(size_t N)
 {
     auto buffer = alloc(N * sizeof(float));
     float *ptr = static_cast<float*>(buffer.get());
@@ -231,7 +231,7 @@ static mufft_buffer create_reference(Type type, Direction direction,
     out = static_cast<cfloat*>(output.get());
     for (unsigned i = 0; i < output_size / sizeof(cfloat); i++)
     {
-        out[i] /= Nx * Ny;
+        out[i] /= static_cast<float>(Nx * Ny);
     }
 
     return output;
@@ -463,7 +463,7 @@ static inline pair<float, float> fp16_to_fp32(uint32_t v)
     return make_pair(fp16_to_fp32(lower), fp16_to_fp32(upper));
 }
 
-static mufft_buffer convert_fp32_fp16(const float *input, unsigned N)
+static mufft_buffer convert_fp32_fp16(const float *input, size_t N)
 {
     auto buffer = alloc(N * sizeof(uint16_t));
     auto ptr = static_cast<uint32_t*>(buffer.get());
@@ -476,7 +476,7 @@ static mufft_buffer convert_fp32_fp16(const float *input, unsigned N)
     return buffer;
 }
 
-static mufft_buffer convert_fp16_fp32(const uint32_t *input, unsigned N)
+static mufft_buffer convert_fp16_fp32(const uint32_t *input, size_t N)
 {
     auto buffer = alloc(N * sizeof(float));
     auto ptr = static_cast<float*>(buffer.get());
@@ -533,8 +533,8 @@ static void run_test_ssbo(Context *context,
         output_data = convert_fp16_fp32(static_cast<const uint32_t*>(output_data.get()), output_size / sizeof(float));
     }
 
-    float epsilon = options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32;
-    float min_snr = options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32;
+    float epsilon = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32);
+    float min_snr = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32);
     if (direction == InverseConvolve)
     {
         epsilon *= 1.5f;
@@ -600,8 +600,8 @@ static void run_test_texture(Context *context,
         output_data = convert_fp16_fp32(static_cast<const uint32_t*>(output_data.get()), output_size / sizeof(float));
     }
 
-    float epsilon = options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32;
-    float min_snr = options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32;
+    float epsilon = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32);
+    float min_snr = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32);
     if (direction == InverseConvolve)
     {
         epsilon *= 1.5f;
@@ -694,8 +694,8 @@ static void run_test_image(Context *context, const TestSuiteArguments &args, uns
 
     auto output_data = readback_texture(context, tex.get(), components, Nx, Ny);
 
-    float epsilon = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32;
-    float min_snr = components > 1 || options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32;
+    float epsilon = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.epsilon_fp16 : args.epsilon_fp32);
+    float min_snr = static_cast<float>(options.type.output_fp16 || options.type.input_fp16 ? args.min_snr_fp16 : args.min_snr_fp32);
     if (direction == InverseConvolve)
     {
         epsilon *= 1.5f;