17 changes: 16 additions & 1 deletion configure.ac
@@ -152,6 +152,15 @@ if test "$have_avx512" = "yes"; then
fi
AM_CONDITIONAL(HAVE_AVX512, test "$have_avx512" = "yes")

+AC_ARG_ENABLE(avx512-scattergather, [AC_HELP_STRING([--enable-avx512-scattergather],[Favor scatter/gather when using AVX512 (for Xeon Phi/KNL)])], have_avx512_scattergather=$enableval, have_avx512_scattergather=no)
+if test "$have_avx512_scattergather" = "yes"; then
+  AC_DEFINE(AVX512_SCATTERGATHER,1,[Define to favor scatter/gather when using AVX512.])
+  if test "$have_avx512" != "yes"; then
+    AC_MSG_ERROR([AVX512 Scatter/Gather requires AVX512])
+  fi
+fi
+AM_CONDITIONAL(AVX512_SCATTERGATHER, test "$have_avx512_scattergather" = "yes")
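For context, a hypothetical invocation enabling the new option (assuming --enable-avx512 is the existing flag behind $have_avx512; the new option errors out unless AVX512 itself is enabled, as the check above shows):

    ./configure --enable-avx512 --enable-avx512-scattergather

The KNL-specific compiler check added later in this diff then requests only -mavx512f, since KNL does not implement the DQ subset of AVX512.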

dnl 128-bit AVX is special. There is no reason to use it on Intel processors
dnl since SSE2 is just as fast. However, on AMD processors we can both use
dnl FMA4, and 128-bit SIMD is better than 256-bit since core pairs in a
@@ -369,7 +378,13 @@ case "${ax_cv_c_compiler_vendor}" in
fi

# AVX512
if test "$have_avx512" = "yes" -a "x$AVX512_CFLAGS" = x; then
if test "$have_avx512" = "yes" -a "$have_avx512_scattergather" = "no" -a "x$AVX512_CFLAGS" = x; then
AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f -mavx512dq"],
[AC_MSG_ERROR([Need a version of gcc with -mavx512f and -mavx512dq])])
fi

+# AVX512 (KNL)
+if test "$have_avx512_scattergather" = "yes" -a "x$AVX512_CFLAGS" = x; then
+AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f"],
+[AC_MSG_ERROR([Need a version of gcc with -mavx512f])])
+fi
48 changes: 28 additions & 20 deletions simd-support/simd-avx512.h
@@ -56,7 +56,7 @@

typedef DS(__m512d, __m512) V;

-#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
+#define VLIT(re, im) DS(SUFF(_mm512_setr)(re, im, re, im, re, im, re, im),SUFF(_mm512_setr)(re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im))
#define VLIT1(val) SUFF(_mm512_set1)(val)
#define LDK(x) x
#define DVK(var, val) V var = VLIT1(val)
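A sketch of what the corrected macro expands to in a double-precision build (assuming, as in FFTW's other simd-support headers, that DS(d,s) selects its first argument when FFTW_SINGLE is not defined and SUFF appends the _pd suffix):

    /* VLIT(re, im) after the fix: _mm512_setr_pd fills lanes in ascending
       order, so real parts land in even lanes and imaginary parts in odd
       lanes, matching FFTW's interleaved (re, im) complex layout. */
    __m512d v = _mm512_setr_pd(re, im, re, im, re, im, re, im);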
@@ -99,55 +99,63 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
-                                     6 * ivs + 1, 6 * ivs,
-                                     5 * ivs + 1, 5 * ivs,
-                                     4 * ivs + 1, 4 * ivs,
-                                     3 * ivs + 1, 3 * ivs,
-                                     2 * ivs + 1, 2 * ivs,
-                                     1 * ivs + 1, 1 * ivs,
-                                     0 * ivs + 1, 0 * ivs);
-
-    return _mm512_i32gather_ps(index, x, 4);
+    /* pretend each re/im pair of singles is one double */
+    const __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs, 3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);
+
+    return (V)_mm512_i32gather_pd(index, x, 4);

Reviewer:

    return (V)_mm512_i32gather_pd(index, x, 8);

Right?

Contributor Author:

Wrote that code a while ago, but I think 4 is correct; the indices still refer to the original datatype - a single-precision value of 4 bytes. The _pd variant is used only to access 64 bits at a time explicitly.

Reviewer:

You're right.

}
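The review discussion above turns on the gather's scale argument. To make it concrete, here is a minimal scalar model of the gather as used in LDu (gather_pd_model, base, and idx are illustrative names, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of _mm512_i32gather_pd(index, x, 4): 64-bit lane k is
       loaded from byte address x + index[k] * 4. With scale = 4 the index
       counts 4-byte floats, while each load covers one 8-byte (re, im) pair. */
    static void gather_pd_model(double *dst, const float *base,
                                const int32_t *idx /* 8 indices, in floats */)
    {
        int k;
        for (k = 0; k < 8; ++k) {
            double d;
            memcpy(&d, (const char *)base + (int64_t)idx[k] * 4, sizeof d);
            dst[k] = d; /* a reinterpreted pair of singles, as in LDu */
        }
    }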

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
-                                     6 * ovs + 1, 6 * ovs,
-                                     5 * ovs + 1, 5 * ovs,
-                                     4 * ovs + 1, 4 * ovs,
-                                     3 * ovs + 1, 3 * ovs,
-                                     2 * ovs + 1, 2 * ovs,
-                                     1 * ovs + 1, 1 * ovs,
-                                     0 * ovs + 1, 0 * ovs);
-
-    _mm512_i32scatter_ps(x, index, v, 4);
+    /* pretend each re/im pair of singles is one double */
+    const __m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs, 5 * ovs, 4 * ovs, 3 * ovs, 2 * ovs, 1 * ovs, 0 * ovs);
+
+    _mm512_i32scatter_pd(x, index, (__m512d)v, 4);

Reviewer:

    _mm512_i32scatter_pd(x, index, (__m512d)v, 8);

Contributor Author:

Same here - ovs is a stride in 4-byte elements, so the index vector is also in 4-byte elements.

}
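As a hypothetical usage of the two routines above (assuming an FFTW_SINGLE build of this header, where R is float and V is __m512, with strides counted in real elements as elsewhere in FFTW's simd-support code):

    /* Copy 8 complex singles from a strided input to a strided output.
       Illustrative only; copy8_strided_complex is not part of the patch. */
    static void copy8_strided_complex(R *dst, const R *src, INT ivs, INT ovs)
    {
        V v = LDu(src, ivs, 0);  /* one gather instead of 8 two-float loads */
        STu(dst, v, ovs, 0);     /* one scatter instead of 8 two-float stores */
    }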

#else /* !FFTW_SINGLE */

static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
+    __m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
+                                     2 * ivs + 1, 2 * ivs,
+                                     1 * ivs + 1, 1 * ivs,
+                                     0 * ivs + 1, 0 * ivs);
+
+    return _mm512_i32gather_pd(index, x, 8);
+#else
    __m128d va0, va1, va2, va3;
    __m256d vb0, vb1;
    int i;
    va0 = _mm_loadu_pd(x + 0*ivs);
    va1 = _mm_loadu_pd(x + 1*ivs);
    va2 = _mm_loadu_pd(x + 2*ivs);
    va3 = _mm_loadu_pd(x + 3*ivs);
    vb0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va0), va1, 1);
    vb1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va2), va3, 1);
    return (_mm512_insertf64x4(_mm512_castpd256_pd512(vb0), vb1, 1));
+#endif
}
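For contrast with the single-precision trick, a scalar model of the double-precision gather path (illustrative names; here scale = 8 because the indices count 8-byte doubles, and each complex number needs two lanes):

    #include <stdint.h>

    /* Scalar model of _mm512_i32gather_pd(index, x, 8) with the index
       pattern {k*ivs + 1, k*ivs}: lane 2k holds the real part and lane
       2k + 1 the imaginary part of the k-th strided complex number. */
    static void gather_pd_double_model(double *dst, const double *base,
                                       int64_t ivs /* stride in doubles */)
    {
        int k;
        for (k = 0; k < 4; ++k) {
            dst[2 * k]     = base[k * ivs];      /* index k*ivs     -> real */
            dst[2 * k + 1] = base[k * ivs + 1];  /* index k*ivs + 1 -> imag */
        }
    }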

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
+    __m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
+                                     2 * ovs + 1, 2 * ovs,
+                                     1 * ovs + 1, 1 * ovs,
+                                     0 * ovs + 1, 0 * ovs);
+
+    _mm512_i32scatter_pd(x, index, v, 8);
+#else
    _mm_storeu_pd(x+ovs*0, _mm512_extractf64x2_pd(v, 0));
    _mm_storeu_pd(x+ovs*1, _mm512_extractf64x2_pd(v, 1));
    _mm_storeu_pd(x+ovs*2, _mm512_extractf64x2_pd(v, 2));
    _mm_storeu_pd(x+ovs*3, _mm512_extractf64x2_pd(v, 3));
+#endif
}

#endif /* FFTW_SINGLE */