BUG, SIMD: Workaround broadcasting SIMD 64-bit integers on MSVC 32-bit #20298

Merged: 3 commits, Nov 4, 2021
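
The diff routes every 64-bit lane set/broadcast through the universal intrinsics (npyv_set_s64, npyv_setall_s64, npyv_setall_u64). Their MSVC 32-bit (_M_IX86) paths assemble each 64-bit lane from two 32-bit halves instead of calling the 64-bit set/set1 intrinsics that MSVC mishandles in 32-bit mode. Below is a minimal sketch of that pattern at SSE width, with hypothetical helper names; the real helpers are the npyv__setr_epi64 and npyv_setall_*64 definitions in the diff, and the AVX2/AVX-512 variants use the same idea with wider setr intrinsics.

// Sketch only, not code from the PR: build each 64-bit lane from its
// 32-bit halves under MSVC 32-bit, use the native intrinsic elsewhere.
#include <emmintrin.h>
#include <stdint.h>

static inline __m128i setr_epi64_sketch(int64_t i0, int64_t i1)
{
#if defined(_MSC_VER) && defined(_M_IX86)
    // _mm_setr_epi32 fills lanes in memory order: low half first, then high half
    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
#else
    // note: _mm_set_epi64x takes the high 64-bit lane first
    return _mm_set_epi64x(i1, i0);
#endif
}

static inline __m128i setall_epi64_sketch(int64_t a)
{
    return setr_epi64_sketch(a, a);  // broadcast = setr with equal lanes
}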
24 changes: 12 additions & 12 deletions numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
#if 0 // slower
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
}
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const void*)ptr, mask);
}
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
}
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
_mm256_maskstore_epi64((void*)ptr, mask, a);
}
27 changes: 25 additions & 2 deletions numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)

NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai);
#else
return _mm256_set1_epi64x(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a);
#else
return _mm256_set1_epi64x(a);
#endif
}
/*
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int i5,
}
NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm256_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
);
#else
return _mm256_setr_epi64x(i0, i1, i2, i3);
#endif
}

NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
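
As a sanity check on the halves construction in npyv__setr_epi64 above (not part of the PR): on a 64-bit build, where both paths compile, the 32-bit setr form produces the same bit pattern as the native 64-bit broadcast. A throwaway test, assuming an AVX2-enabled compile (e.g. -mavx2 or /arch:AVX2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const int64_t v = INT64_C(0x1122334455667788);
    __m256i direct = _mm256_set1_epi64x(v);      // native 64-bit broadcast
    __m256i halves = _mm256_setr_epi32(          // 32-bit halves, as in npyv__setr_epi64
        (int)v, (int)(v >> 32), (int)v, (int)(v >> 32),
        (int)v, (int)(v >> 32), (int)v, (int)(v >> 32));
    puts(memcmp(&direct, &halves, sizeof direct) == 0 ? "match" : "mismatch");
    return 0;
}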
2 changes: 1 addition & 1 deletion numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
return _mm512_range_pd(a, a, 8);
#else
return npyv_and_f64(
a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
);
#endif
}
12 changes: 6 additions & 6 deletions numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
//// 64
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
//// 64
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
32 changes: 30 additions & 2 deletions numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)

NPY_FINLINE __m512i npyv__setr_epi64(
npy_int64, npy_int64, npy_int64, npy_int64,
npy_int64, npy_int64, npy_int64, npy_int64
);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
#else
return _mm512_set1_epi64(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a, a, a, a, a);
#else
return _mm512_set1_epi64(a);
#endif
}
/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm512_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
(int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
(int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
);
#else
return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
#endif
}

NPY_FINLINE __m512 npyv__setr_ps(
19 changes: 19 additions & 0 deletions numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
typedef float npyv_lanetype_f32;
typedef double npyv_lanetype_f64;

#if defined(_MSC_VER) && defined(_M_IX86)
/*
* Avoid using any of the following intrinsics with MSVC 32-bit,
* even though they appear to work on newer versions.
* They have a bad impact on the generated instructions:
* sometimes the compiler handles them without respecting 32-bit mode,
* which leads to crashes from executing 64-bit instructions, and at
* other times it generates poorly emulated instructions.
*/
#undef _mm512_set1_epi64
#undef _mm256_set1_epi64x
#undef _mm_set1_epi64x
#undef _mm512_setr_epi64x
#undef _mm256_setr_epi64x
#undef _mm_setr_epi64x
#undef _mm512_set_epi64x
#undef _mm256_set_epi64x
#undef _mm_set_epi64x
#endif
#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
#include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
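
For code written against the universal intrinsics, the guard above is transparent: call sites keep using the npyv_* constructors and the correct path is chosen at compile time. A hypothetical caller is sketched below, assuming it is built where "simd/simd.h" resolves (numpy/core/src/common) and using the existing npyv load/store/add API; the kernel itself is illustrative and not taken from this diff.

#include "simd/simd.h"

#if NPY_SIMD
// Add a 64-bit constant to every element; only npyv_* constructors are used,
// so the MSVC 32-bit workaround applies here without any change.
static void add_constant_s64(npy_int64 *dst, npy_intp len, npy_int64 c)
{
    const npyv_s64 vc = npyv_setall_s64(c);   // safe broadcast on every target
    npy_intp i = 0;
    for (; i + npyv_nlanes_s64 <= len; i += npyv_nlanes_s64) {
        npyv_s64 v = npyv_load_s64(dst + i);
        npyv_store_s64(dst + i, npyv_add_s64(v, vc));
    }
    for (; i < len; ++i) {
        dst[i] += c;                          // scalar tail
    }
}
#endif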
25 changes: 23 additions & 2 deletions numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_f32 _mm_set1_ps
#define npyv_setall_f64 _mm_set1_pd

NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);

NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}

/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
}
NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
#else
return _mm_set_epi64x(i1, i0);
#endif
}
NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
{