diff --git a/configure.ac b/configure.ac
index 5e33b1c2b..24f3143a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -152,6 +152,15 @@ if test "$have_avx512" = "yes"; then
 fi
 AM_CONDITIONAL(HAVE_AVX512, test "$have_avx512" = "yes")
 
+AC_ARG_ENABLE(avx512-scattergather, [AC_HELP_STRING([--enable-avx512-scattergather],[Favor scatter/gather when using AVX512 (for Xeon Phi/KNL)])], have_avx512_scattergather=$enableval, have_avx512_scattergather=no)
+if test "$have_avx512_scattergather" = "yes"; then
+    AC_DEFINE(AVX512_SCATTERGATHER,1,[Define to favor scatter/gather when using AVX512.])
+    if test "$have_avx512" != "yes"; then
+        AC_MSG_ERROR([AVX512 Scatter/Gather requires AVX512])
+    fi
+fi
+AM_CONDITIONAL(AVX512_SCATTERGATHER, test "$have_avx512_scattergather" = "yes")
+
 dnl 128-bit AVX is special.  There is no reason to use it on Intel processors
 dnl since SSE2 is just as fast.  However, on AMD processors we can both use
 dnl FMA4, and 128-bit SIMD is better than 256-bit since core pairs in a
@@ -369,7 +378,13 @@ case "${ax_cv_c_compiler_vendor}" in
 	fi
 
 	# AVX512
-	if test "$have_avx512" = "yes" -a "x$AVX512_CFLAGS" = x; then
+	if test "$have_avx512" = "yes" -a "$have_avx512_scattergather" = "no" -a "x$AVX512_CFLAGS" = x; then
+	    AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f -mavx512dq"],
+	    [AC_MSG_ERROR([Need a version of gcc with -mavx512f and -mavx512dq])])
+	fi
+
+	# AVX512 (KNL)
+	if test "$have_avx512_scattergather" = "yes" -a "x$AVX512_CFLAGS" = x; then
 	    AX_CHECK_COMPILE_FLAG(-mavx512f, [AVX512_CFLAGS="-mavx512f"],
 	    [AC_MSG_ERROR([Need a version of gcc with -mavx512f])])
 	fi
diff --git a/simd-support/simd-avx512.h b/simd-support/simd-avx512.h
index 47b60e942..d69b60c77 100644
--- a/simd-support/simd-avx512.h
+++ b/simd-support/simd-avx512.h
@@ -56,7 +56,7 @@ typedef DS(__m512d, __m512) V;
-#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
+#define VLIT(re, im) DS(SUFF(_mm512_setr)(re, im, re, im, re, im, re, im),SUFF(_mm512_setr)(re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im))
 #define VLIT1(val) SUFF(_mm512_set1)(val)
 #define LDK(x) x
 #define DVK(var, val) V var = VLIT1(val)
@@ -99,31 +99,19 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
-                                     6 * ivs + 1, 6 * ivs,
-                                     5 * ivs + 1, 5 * ivs,
-                                     4 * ivs + 1, 4 * ivs,
-                                     3 * ivs + 1, 3 * ivs,
-                                     2 * ivs + 1, 2 * ivs,
-                                     1 * ivs + 1, 1 * ivs,
-                                     0 * ivs + 1, 0 * ivs);
+    /* pretend each pair of singles is a double */
+    const __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs, 3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);
 
-    return _mm512_i32gather_ps(index, x, 4);
+    return (V)_mm512_i32gather_pd(index, x, 4);
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
-    __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
-                                     6 * ovs + 1, 6 * ovs,
-                                     5 * ovs + 1, 5 * ovs,
-                                     4 * ovs + 1, 4 * ovs,
-                                     3 * ovs + 1, 3 * ovs,
-                                     2 * ovs + 1, 2 * ovs,
-                                     1 * ovs + 1, 1 * ovs,
-                                     0 * ovs + 1, 0 * ovs);
-
-    _mm512_i32scatter_ps(x, index, v, 4);
+    /* pretend each pair of singles is a double */
+    const __m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs, 5 * ovs, 4 * ovs, 3 * ovs, 2 * ovs, 1 * ovs, 0 * ovs);
+
+    _mm512_i32scatter_pd(x, index, (__m512d)v, 4);
 }
 
 #else /* !FFTW_SINGLE */
@@ -131,23 +119,43 @@ static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 static inline V LDu(const R *x, INT ivs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
     __m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
                                      2 * ivs + 1, 2 * ivs,
                                      1 * ivs + 1, 1 * ivs,
                                      0 * ivs + 1, 0 * ivs);
 
     return _mm512_i32gather_pd(index, x, 8);
+#else
+    __m128d va0, va1, va2, va3;
+    __m256d vb0, vb1;
+    int i;
+    va0 = _mm_loadu_pd(x + 0*ivs);
+    va1 = _mm_loadu_pd(x + 1*ivs);
+    va2 = _mm_loadu_pd(x + 2*ivs);
+    va3 = _mm_loadu_pd(x + 3*ivs);
+    vb0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va0), va1, 1);
+    vb1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(va2), va3, 1);
+    return (_mm512_insertf64x4(_mm512_castpd256_pd512(vb0), vb1, 1));
+#endif
 }
 
 static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
 {
     (void)aligned_like; /* UNUSED */
+#if defined(AVX512_SCATTERGATHER)
     __m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
                                      2 * ovs + 1, 2 * ovs,
                                      1 * ovs + 1, 1 * ovs,
                                      0 * ovs + 1, 0 * ovs);
 
     _mm512_i32scatter_pd(x, index, v, 8);
+#else
+    _mm_storeu_pd(x + ovs*0, _mm512_extractf64x2_pd(v, 0));
+    _mm_storeu_pd(x + ovs*1, _mm512_extractf64x2_pd(v, 1));
+    _mm_storeu_pd(x + ovs*2, _mm512_extractf64x2_pd(v, 2));
+    _mm_storeu_pd(x + ovs*3, _mm512_extractf64x2_pd(v, 3));
+#endif
 }
 
 #endif /* FFTW_SINGLE */
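
For reference, a minimal standalone sketch (not part of the patch) of the single-precision trick used in LDu/STu above: each re/im pair of floats occupies 8 bytes, so a strided complex load can be expressed as a gather of eight doubles whose 32-bit indices are in float units and whose scale is sizeof(float) = 4. The array size, the stride value ivs, and the printing harness below are illustrative assumptions only.

/* Sketch: strided load of 8 complex floats via a 64-bit gather.
 * Build with: gcc -O2 -mavx512f sketch.c  (needs an AVX-512 CPU to run) */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    float x[64];
    const int ivs = 6;                    /* stride between complex elements, in floats */
    for (int i = 0; i < 64; ++i)
        x[i] = (float)i;

    /* one 32-bit index per re/im pair, in units of sizeof(float) */
    __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs,
                                     3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);

    /* gather eight 64-bit lanes; scale 4 converts the float-unit indices to
     * byte offsets, so each lane picks up one re/im pair */
    __m512d pairs = _mm512_i32gather_pd(index, x, 4);
    __m512  v     = _mm512_castpd_ps(pairs);

    float out[16];
    _mm512_storeu_ps(out, v);
    for (int i = 0; i < 8; ++i)           /* expect x[i*ivs], x[i*ivs + 1] */
        printf("%g %g\n", out[2*i], out[2*i + 1]);
    return 0;
}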
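
Likewise, a standalone sketch (again with made-up stride and data) of the new double-precision fallback, i.e. the #else branches above: when AVX512_SCATTERGATHER is not defined, four unaligned 2-double loads are merged into one __m512d, and the store peels the four 128-bit lanes back out. _mm512_extractf64x2_pd is an AVX-512DQ instruction, which is presumably why the non-KNL configure branch adds -mavx512dq alongside -mavx512f.

/* Sketch: strided complex-double load/store without scatter/gather.
 * Build with: gcc -O2 -mavx512f -mavx512dq sketch.c */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double x[32], y[32] = {0};
    const int vs = 5;                     /* stride between complex elements, in doubles */
    for (int i = 0; i < 32; ++i)
        x[i] = (double)i;

    /* load: four re/im pairs -> two __m256d -> one __m512d */
    __m128d a0 = _mm_loadu_pd(x + 0*vs);
    __m128d a1 = _mm_loadu_pd(x + 1*vs);
    __m128d a2 = _mm_loadu_pd(x + 2*vs);
    __m128d a3 = _mm_loadu_pd(x + 3*vs);
    __m256d b0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(a0), a1, 1);
    __m256d b1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(a2), a3, 1);
    __m512d v  = _mm512_insertf64x4(_mm512_castpd256_pd512(b0), b1, 1);

    /* store: extract the four 128-bit lanes and write each pair back */
    _mm_storeu_pd(y + 0*vs, _mm512_extractf64x2_pd(v, 0));
    _mm_storeu_pd(y + 1*vs, _mm512_extractf64x2_pd(v, 1));
    _mm_storeu_pd(y + 2*vs, _mm512_extractf64x2_pd(v, 2));
    _mm_storeu_pd(y + 3*vs, _mm512_extractf64x2_pd(v, 3));

    for (int i = 0; i < 4; ++i)           /* each pair should round-trip unchanged */
        printf("%g %g\n", y[i*vs], y[i*vs + 1]);
    return 0;
}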