From e4f897f1d564fc232fe0d6c970cf935246a1ac0b Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@atos.net>
Date: Wed, 11 Mar 2020 11:41:56 +0100
Subject: [PATCH 1/3] Replace Scatter/Gather in AVX512 DP by a sequence of
 instructions to assemble/disassemble the vector in 128 bits chunks.

This is faster on Skylake, but will not work on Knights Landing (as KNL lacks AVX512DQ), so I've added an --enable-avx512-scattergather option to retain the old behavior and enable compiling/using AVX512 on KNL.
This should help with #143.
---
 configure.ac               |  9 +++++++++
 simd-support/simd-avx512.h | 22 +++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 5e33b1c2b..c017f8d02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -152,6 +152,15 @@ if test "$have_avx512" = "yes"; then
 fi
 AM_CONDITIONAL(HAVE_AVX512, test "$have_avx512" = "yes")
 
+AC_ARG_ENABLE(avx512-scattergather, [AC_HELP_STRING([--enable-avx512-scattergather],[Favor scatter/gather when using AVX512 (for Xeon Phi/KNL)])], have_avx512_scattergather=$enableval, have_avx512_scattergather=no)
+if test "$have_avx512_scattergather" = "yes"; then
+        AC_DEFINE(AVX512_SCATTERGATHER,1,[Define to favor scatter/gather when using AVX512.])
+        if test "$have_avx512" != "yes"; then
+                AC_MSG_ERROR([AVX512 Scatter/Gather requires AVX512])
+        fi
+fi
+AM_CONDITIONAL(AVX512_SCATTERGATHER, test "$have_avx512_scattergather" = "yes")
+
 dnl 128-bit AVX is special. There is no reason to use it on Intel processors
 dnl since SSE2 is just as fast. However, on AMD processors we can both use
 dnl FMA4, and 128-bit SIMD is better than 256-bit since core pairs in a
diff --git a/simd-support/simd-avx512.h b/simd-support/simd-avx512.h
index 47b60e942..22ed7d8f6 100644
--- a/simd-support/simd-avx512.h
+++ b/simd-support/simd-avx512.h
@@ -56,7 +56,7 @@
 
 typedef DS(__m512d, __m512) V;
 
-#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
+#define VLIT(re, im) DS(SUFF(_mm512_setr)(re, im, re, im, re, im, re, im),SUFF(_mm512_setr)(re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im))
 #define VLIT1(val) SUFF(_mm512_set1)(val)
 #define LDK(x) x
 #define DVK(var, val) V var = VLIT1(val)
@@ -131,23 +131,43 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
   __m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
                                    2 * ivs + 1, 2 * ivs,
                                    1 * ivs + 1, 1 * ivs,
                                    0 * ivs + 1, 0 * ivs);
   
   return _mm512_i32gather_pd(index, x, 8);
+#else
+  __m128d va0, va1, va2, va3;
+  __m256d vb0, vb1;
+  int i;
+  va0 = _mm_loadu_pd(x + 0*ivs);
+  va1 = _mm_loadu_pd(x + 1*ivs);
+  va2 = _mm_loadu_pd(x + 2*ivs);
+  va3 = _mm_loadu_pd(x + 3*ivs);
+  vb0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va0), va1, 1);
+  vb1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va2), va3, 1);
+  return (_mm512_insertf64x4(_mm512_castpd256_pd512(vb0), vb1, 1));
+#endif
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
   __m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
                                    2 * ovs + 1, 2 * ovs,
                                    1 * ovs + 1, 1 * ovs,
                                    0 * ovs + 1, 0 * ovs);
   
   _mm512_i32scatter_pd(x, index, v, 8);
+#else
+  _mm_storeu_pd(x+ovs*0, _mm512_extractf64x2_pd(v, 0));
+  _mm_storeu_pd(x+ovs*1, _mm512_extractf64x2_pd(v, 1));
+  _mm_storeu_pd(x+ovs*2, _mm512_extractf64x2_pd(v, 2));
+  _mm_storeu_pd(x+ovs*3, _mm512_extractf64x2_pd(v, 3));
+#endif
 }
 
 #endif /* FFTW_SINGLE */

From c6cd1cd3b09d3c1f01f1ae89163e7e3a269191f1 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@atos.net>
Date: Wed, 11 Mar 2020 13:38:09 +0100
Subject: [PATCH 2/3] In AVX-512 LDu/STu, handle pair of single as a double.

This should improves slightly the performance by reducing the number of uops needed to do the gather/scatter.
---
 simd-support/simd-avx512.h | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/simd-support/simd-avx512.h b/simd-support/simd-avx512.h
index 22ed7d8f6..d69b60c77 100644
--- a/simd-support/simd-avx512.h
+++ b/simd-support/simd-avx512.h
@@ -99,31 +99,19 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
-                                   6 * ivs + 1, 6 * ivs,
-                                   5 * ivs + 1, 5 * ivs,
-                                   4 * ivs + 1, 4 * ivs,
-                                   3 * ivs + 1, 3 * ivs,
-                                   2 * ivs + 1, 2 * ivs,
-                                   1 * ivs + 1, 1 * ivs,
-                                   0 * ivs + 1, 0 * ivs);
+  /* pretend pair of single are a double */
+  const __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs, 3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);
   
-  return _mm512_i32gather_ps(index, x, 4);
+  return (V)_mm512_i32gather_pd(index, x, 4);
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
   (void)aligned_like; /* UNUSED */
-  __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
-                                   6 * ovs + 1, 6 * ovs,
-                                   5 * ovs + 1, 5 * ovs,
-                                   4 * ovs + 1, 4 * ovs,
-                                   3 * ovs + 1, 3 * ovs,
-                                   2 * ovs + 1, 2 * ovs,
-                                   1 * ovs + 1, 1 * ovs,
-                                   0 * ovs + 1, 0 * ovs);
-  
-  _mm512_i32scatter_ps(x, index, v, 4);
+  /* pretend pair of single are a double */
+  const __m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs, 5 * ovs, 4 * ovs, 3 * ovs, 2 * ovs, 1 * ovs, 0 * ovs);
+
+  _mm512_i32scatter_pd(x, index, (__m512d)v, 4);
 }
 
 #else /* !FFTW_SINGLE */

From fd08d118da79ffbe0a8319b2bbf9f358a2fbd4f5 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@sipearl.com>
Date: Sun, 14 Jul 2024 16:28:59 +0200
Subject: [PATCH 3/3] new AVX512 code requires AVX512DQ, add the compiler flags
 in configure.ac

---
 configure.ac | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index c017f8d02..24f3143a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -378,7 +378,13 @@ case "${ax_cv_c_compiler_vendor}" in
         fi
 
         # AVX512
-        if test "$have_avx512" = "yes" -a "x$AVX512_CFLAGS" = x; then
+        if test "$have_avx512" = "yes" -a "$have_avx512_scattergather" = "no" -a "x$AVX512_CFLAGS" = x; then
+            AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f -mavx512dq"],
+            [AC_MSG_ERROR([Need a version of gcc with -mavx512f and -mavx512dq])])
+        fi
+
+        # AVX512 (KNL)
+        if test "$have_avx512_scattergather" = "yes" -a "x$AVX512_CFLAGS" = x; then
             AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f"],
             [AC_MSG_ERROR([Need a version of gcc with -mavx512f])])
         fi