Merge pull request #20298 from charris/backport-20219
BUG, SIMD: Workaround broadcasting SIMD 64-bit integers on MSVC 32-bit
charris committed Nov 4, 2021
2 parents 85f64da + 30fe7fd commit 6b3d17e
Showing 7 changed files with 116 additions and 25 deletions.
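The change works around MSVC's 32-bit (x86) code generation for the 64-bit element "set" intrinsics by assembling each 64-bit lane from its two 32-bit halves instead. As a rough illustration of the technique (a standalone sketch, not part of the patch; the helper name demo_set1_epi64 is made up), the SSE-width broadcast might be emulated like this, assuming an SSE2-capable compiler:

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>

/* Broadcast a 64-bit integer to both lanes of an __m128i. On MSVC
 * targeting 32-bit x86 the 64-bit intrinsic is avoided and each lane
 * is built from its low and high 32-bit halves. */
static __m128i demo_set1_epi64(int64_t a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
    return _mm_setr_epi32((int)a, (int)(a >> 32), (int)a, (int)(a >> 32));
#else
    return _mm_set1_epi64x(a);
#endif
}

int main(void)
{
    int64_t lanes[2];
    _mm_storeu_si128((__m128i*)lanes, demo_set1_epi64(0x0123456789abcdefLL));
    printf("%llx %llx\n", (unsigned long long)lanes[0], (unsigned long long)lanes[1]);
    return 0;
}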
24 changes: 12 additions & 12 deletions numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
#if 0 // slower
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
}
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const void*)ptr, mask);
}
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
}
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
_mm256_maskstore_epi64((void*)ptr, mask, a);
}
27 changes: 25 additions & 2 deletions numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)

NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai);
#else
return _mm256_set1_epi64x(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a);
#else
return _mm256_set1_epi64x(a);
#endif
}
/*
* vector with specific values set to each lane and
* set a specific value to all remaining lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int
}
NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm256_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
);
#else
return _mm256_setr_epi64x(i0, i1, i2, i3);
#endif
}

NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
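The AVX2 and AVX-512 headers apply the same splitting inside their npyv__setr_epi64 helpers, as in the MSVC 32-bit branch above. A small hypothetical check of that construction at AVX2 width (demo_setr_epi64 is an illustrative name, not from the patch; assumes an AVX2-capable build):

#include <immintrin.h>   /* AVX2 intrinsics */
#include <stdint.h>
#include <assert.h>

/* Build four 64-bit lanes from eight 32-bit halves, low half first,
 * mirroring the MSVC 32-bit branch of npyv__setr_epi64. */
static __m256i demo_setr_epi64(int64_t i0, int64_t i1, int64_t i2, int64_t i3)
{
    return _mm256_setr_epi32(
        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
    );
}

int main(void)
{
    int64_t out[4];
    _mm256_storeu_si256((__m256i*)out, demo_setr_epi64(1, -2, 3, -4));
    assert(out[0] == 1 && out[1] == -2 && out[2] == 3 && out[3] == -4);
    return 0;
}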
2 changes: 1 addition & 1 deletion numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
return _mm512_range_pd(a, a, 8);
#else
return npyv_and_f64(
a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
);
#endif
}
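The avx512/math.h hunk above only swaps the raw _mm512_set1_epi64 call for the new npyv_setall_s64 wrapper; the constant itself is the familiar sign-bit mask used for floating-point absolute value. A scalar sketch of what that mask does (illustrative only, not part of the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Clearing the IEEE-754 sign bit with 0x7fffffffffffffff gives |x|,
 * which is what npyv_abs_f64 does lane-wise with a vector AND. */
static double abs_via_mask(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    bits &= 0x7fffffffffffffffULL;
    memcpy(&x, &bits, sizeof x);
    return x;
}

int main(void)
{
    printf("%f %f\n", abs_via_mask(-2.5), abs_via_mask(3.0)); /* prints 2.500000 3.000000 */
    return 0;
}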
12 changes: 6 additions & 6 deletions numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
//// 64
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
//// 64
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
32 changes: 30 additions & 2 deletions numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)

NPY_FINLINE __m512i npyv__setr_epi64(
npy_int64, npy_int64, npy_int64, npy_int64,
npy_int64, npy_int64, npy_int64, npy_int64
);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
#else
return _mm512_set1_epi64(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a, a, a, a, a);
#else
return _mm512_set1_epi64(a);
#endif
}
/**
* vector with specific values set to each lane and
* set a specific value to all remaining lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm512_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
(int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
(int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
);
#else
return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
#endif
}

NPY_FINLINE __m512 npyv__setr_ps(
19 changes: 19 additions & 0 deletions numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
typedef float npyv_lanetype_f32;
typedef double npyv_lanetype_f64;

#if defined(_MSC_VER) && defined(_M_IX86)
/*
* Avoid using any of the following intrinsics with MSVC 32-bit,
* even if they appear to work on newer versions.
* They have a bad impact on the generated instructions:
* sometimes the compiler handles them without respecting
* 32-bit mode, which leads to crashes from executing 64-bit
* instructions, and other times it generates bad emulated instructions.
*/
#undef _mm512_set1_epi64
#undef _mm256_set1_epi64x
#undef _mm_set1_epi64x
#undef _mm512_setr_epi64x
#undef _mm256_setr_epi64x
#undef _mm_setr_epi64x
#undef _mm512_set_epi64x
#undef _mm256_set_epi64x
#undef _mm_set_epi64x
#endif
#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
#include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
25 changes: 23 additions & 2 deletions numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_f32 _mm_set1_ps
#define npyv_setall_f64 _mm_set1_pd

NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);

NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}

/**
* vector with specific values set to each lane and
* set a specific value to all remaining lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
}
NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
#else
return _mm_set_epi64x(i1, i0);
#endif
}
NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
{