17 changes: 16 additions & 1 deletion configure.ac
@@ -152,6 +152,15 @@ if test "$have_avx512" = "yes"; then
fi
AM_CONDITIONAL(HAVE_AVX512, test "$have_avx512" = "yes")

+AC_ARG_ENABLE(avx512-scattergather, [AC_HELP_STRING([--enable-avx512-scattergather],[Favor scatter/gather when using AVX512 (for Xeon Phi/KNL)])], have_avx512_scattergather=$enableval, have_avx512_scattergather=no)
+if test "$have_avx512_scattergather" = "yes"; then
+  AC_DEFINE(AVX512_SCATTERGATHER,1,[Define to favor scatter/gather when using AVX512.])
+  if test "$have_avx512" != "yes"; then
+    AC_MSG_ERROR([AVX512 Scatter/Gather requires AVX512])
+  fi
+fi
+AM_CONDITIONAL(AVX512_SCATTERGATHER, test "$have_avx512_scattergather" = "yes")
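For context, a hypothetical invocation enabling the new option (assuming --enable-avx512 is the existing flag behind $have_avx512; the new option errors out unless AVX512 itself is enabled, as the check above shows):

    ./configure --enable-avx512 --enable-avx512-scattergather

The KNL-specific compiler check added later in this diff then requests only -mavx512f, since KNL does not implement the DQ subset of AVX512.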

dnl 128-bit AVX is special. There is no reason to use it on Intel processors
dnl since SSE2 is just as fast. However, on AMD processors we can both use
dnl FMA4, and 128-bit SIMD is better than 256-bit since core pairs in a
@@ -369,7 +378,13 @@ case "${ax_cv_c_compiler_vendor}" in
fi

# AVX512
if test "$have_avx512" = "yes" -a "x$AVX512_CFLAGS" = x; then
if test "$have_avx512" = "yes" -a "$have_avx512_scattergather" = "no" -a "x$AVX512_CFLAGS" = x; then
AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f -mavx512dq"],
[AC_MSG_ERROR([Need a version of gcc with -mavx512f and -mavx512dq])])
fi

+# AVX512 (KNL)
+if test "$have_avx512_scattergather" = "yes" -a "x$AVX512_CFLAGS" = x; then
+AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f"],
+[AC_MSG_ERROR([Need a version of gcc with -mavx512f])])
+fi
48 changes: 28 additions & 20 deletions simd-support/simd-avx512.h
@@ -56,7 +56,7 @@

typedef DS(__m512d, __m512) V;

-#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
+#define VLIT(re, im) DS(SUFF(_mm512_setr)(re, im, re, im, re, im, re, im),SUFF(_mm512_setr)(re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im))
#define VLIT1(val) SUFF(_mm512_set1)(val)
#define LDK(x) x
#define DVK(var, val) V var = VLIT1(val)
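A sketch of what the corrected macro expands to in a double-precision build (assuming, as in FFTW's other simd-support headers, that DS(d,s) selects its first argument when FFTW_SINGLE is not defined and SUFF appends the _pd suffix):

    /* VLIT(re, im) after the fix: _mm512_setr_pd fills lanes in ascending
       order, so real parts land in even lanes and imaginary parts in odd
       lanes, matching FFTW's interleaved (re, im) complex layout. */
    __m512d v = _mm512_setr_pd(re, im, re, im, re, im, re, im);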
@@ -99,55 +99,63 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
-                                     6 * ivs + 1, 6 * ivs,
-                                     5 * ivs + 1, 5 * ivs,
-                                     4 * ivs + 1, 4 * ivs,
-                                     3 * ivs + 1, 3 * ivs,
-                                     2 * ivs + 1, 2 * ivs,
-                                     1 * ivs + 1, 1 * ivs,
-                                     0 * ivs + 1, 0 * ivs);
-
-    return _mm512_i32gather_ps(index, x, 4);
+    /* pretend each re/im pair of singles is one double */
+    const __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs, 3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);
+
+    return (V)_mm512_i32gather_pd(index, x, 4);

Reviewer:

    return (V)_mm512_i32gather_pd(index, x, 8);

Right?

Contributor Author:

Wrote that code a while ago, but I think 4 is correct; the indices still refer to the original datatype - a single-precision value of 4 bytes. The _pd variant is used only to access 64 bits at a time explicitly.

Reviewer:

You're right.

}
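The review discussion above turns on the gather's scale argument. To make it concrete, here is a minimal scalar model of the gather as used in LDu (gather_pd_model, base, and idx are illustrative names, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of _mm512_i32gather_pd(index, x, 4): 64-bit lane k is
       loaded from byte address x + index[k] * 4. With scale = 4 the index
       counts 4-byte floats, while each load covers one 8-byte (re, im) pair. */
    static void gather_pd_model(double *dst, const float *base,
                                const int32_t *idx /* 8 indices, in floats */)
    {
        int k;
        for (k = 0; k < 8; ++k) {
            double d;
            memcpy(&d, (const char *)base + (int64_t)idx[k] * 4, sizeof d);
            dst[k] = d; /* a reinterpreted pair of singles, as in LDu */
        }
    }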

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
-                                     6 * ovs + 1, 6 * ovs,
-                                     5 * ovs + 1, 5 * ovs,
-                                     4 * ovs + 1, 4 * ovs,
-                                     3 * ovs + 1, 3 * ovs,
-                                     2 * ovs + 1, 2 * ovs,
-                                     1 * ovs + 1, 1 * ovs,
-                                     0 * ovs + 1, 0 * ovs);
-
-    _mm512_i32scatter_ps(x, index, v, 4);
+    /* pretend each re/im pair of singles is one double */
+    const __m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs, 5 * ovs, 4 * ovs, 3 * ovs, 2 * ovs, 1 * ovs, 0 * ovs);
+
+    _mm512_i32scatter_pd(x, index, (__m512d)v, 4);

Reviewer:

    _mm512_i32scatter_pd(x, index, (__m512d)v, 8);

Contributor Author:

Same here - ovs is a stride in 4-byte elements, so the index vector is also in 4-byte elements.

}
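As a hypothetical usage of the two routines above (assuming an FFTW_SINGLE build of this header, where R is float and V is __m512, with strides counted in real elements as elsewhere in FFTW's simd-support code):

    /* Copy 8 complex singles from a strided input to a strided output.
       Illustrative only; copy8_strided_complex is not part of the patch. */
    static void copy8_strided_complex(R *dst, const R *src, INT ivs, INT ovs)
    {
        V v = LDu(src, ivs, 0);  /* one gather instead of 8 two-float loads */
        STu(dst, v, ovs, 0);     /* one scatter instead of 8 two-float stores */
    }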

#else /* !FFTW_SINGLE */

static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
+    __m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
+                                     2 * ivs + 1, 2 * ivs,
+                                     1 * ivs + 1, 1 * ivs,
+                                     0 * ivs + 1, 0 * ivs);
+
+    return _mm512_i32gather_pd(index, x, 8);
+#else
    __m128d va0, va1, va2, va3;
    __m256d vb0, vb1;
    int i;
    va0 = _mm_loadu_pd(x + 0*ivs);
    va1 = _mm_loadu_pd(x + 1*ivs);
    va2 = _mm_loadu_pd(x + 2*ivs);
    va3 = _mm_loadu_pd(x + 3*ivs);
    vb0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va0), va1, 1);
    vb1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va2), va3, 1);
    return (_mm512_insertf64x4(_mm512_castpd256_pd512(vb0), vb1, 1));
+#endif
}
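For contrast with the single-precision trick, a scalar model of the double-precision gather path (illustrative names; here scale = 8 because the indices count 8-byte doubles, and each complex number needs two lanes):

    #include <stdint.h>

    /* Scalar model of _mm512_i32gather_pd(index, x, 8) with the index
       pattern {k*ivs + 1, k*ivs}: lane 2k holds the real part and lane
       2k + 1 the imaginary part of the k-th strided complex number. */
    static void gather_pd_double_model(double *dst, const double *base,
                                       int64_t ivs /* stride in doubles */)
    {
        int k;
        for (k = 0; k < 4; ++k) {
            dst[2 * k]     = base[k * ivs];      /* index k*ivs     -> real */
            dst[2 * k + 1] = base[k * ivs + 1];  /* index k*ivs + 1 -> imag */
        }
    }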

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
+    __m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
+                                     2 * ovs + 1, 2 * ovs,
+                                     1 * ovs + 1, 1 * ovs,
+                                     0 * ovs + 1, 0 * ovs);
+
+    _mm512_i32scatter_pd(x, index, v, 8);
+#else
    _mm_storeu_pd(x+ovs*0, _mm512_extractf64x2_pd(v, 0));
    _mm_storeu_pd(x+ovs*1, _mm512_extractf64x2_pd(v, 1));
    _mm_storeu_pd(x+ovs*2, _mm512_extractf64x2_pd(v, 2));
    _mm_storeu_pd(x+ovs*3, _mm512_extractf64x2_pd(v, 3));
+#endif
}

#endif /* FFTW_SINGLE */