BUG, SIMD: Workaround broadcasting SIMD 64-bit integers on MSVC 32-bit #20298

Merged: 3 commits, Nov 4, 2021
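
The diff routes every 64-bit lane set/broadcast through the universal intrinsics (npyv_set_s64, npyv_setall_s64, npyv_setall_u64). Their MSVC 32-bit (_M_IX86) paths assemble each 64-bit lane from two 32-bit halves instead of calling the 64-bit set/set1 intrinsics that MSVC mishandles in 32-bit mode. Below is a minimal sketch of that pattern at SSE width, with hypothetical helper names; the real helpers are the npyv__setr_epi64 and npyv_setall_*64 definitions in the diff, and the AVX2/AVX-512 variants use the same idea with wider setr intrinsics.

// Sketch only, not code from the PR: build each 64-bit lane from its
// 32-bit halves under MSVC 32-bit, use the native intrinsic elsewhere.
#include <emmintrin.h>
#include <stdint.h>

static inline __m128i setr_epi64_sketch(int64_t i0, int64_t i1)
{
#if defined(_MSC_VER) && defined(_M_IX86)
    // _mm_setr_epi32 fills lanes in memory order: low half first, then high half
    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
#else
    // note: _mm_set_epi64x takes the high 64-bit lane first
    return _mm_set_epi64x(i1, i0);
#endif
}

static inline __m128i setall_epi64_sketch(int64_t a)
{
    return setr_epi64_sketch(a, a);  // broadcast = setr with equal lanes
}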
24 changes: 12 additions & 12 deletions numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
#if 0 // slower
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
}
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const void*)ptr, mask);
}
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m256i vfill = _mm256_set1_epi64x(fill);
const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
}
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
_mm256_maskstore_epi64((void*)ptr, mask, a);
}
27 changes: 25 additions & 2 deletions numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)

NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai);
#else
return _mm256_set1_epi64x(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a);
#else
return _mm256_set1_epi64x(a);
#endif
}
/*
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int i5,
}
NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm256_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
);
#else
return _mm256_setr_epi64x(i0, i1, i2, i3);
#endif
}

NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
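
As a sanity check on the halves construction in npyv__setr_epi64 above (not part of the PR): on a 64-bit build, where both paths compile, the 32-bit setr form produces the same bit pattern as the native 64-bit broadcast. A throwaway test, assuming an AVX2-enabled compile (e.g. -mavx2 or /arch:AVX2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const int64_t v = INT64_C(0x1122334455667788);
    __m256i direct = _mm256_set1_epi64x(v);      // native 64-bit broadcast
    __m256i halves = _mm256_setr_epi32(          // 32-bit halves, as in npyv__setr_epi64
        (int)v, (int)(v >> 32), (int)v, (int)(v >> 32),
        (int)v, (int)(v >> 32), (int)v, (int)(v >> 32));
    puts(memcmp(&direct, &halves, sizeof direct) == 0 ? "match" : "mismatch");
    return 0;
}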
2 changes: 1 addition & 1 deletion numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
return _mm512_range_pd(a, a, 8);
#else
return npyv_and_f64(
a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
);
#endif
}
12 changes: 6 additions & 6 deletions numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
//// 64
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
//// 64
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = _mm512_set1_epi64(fill);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
const __m512i idx = _mm512_setr_epi64(
const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
32 changes: 30 additions & 2 deletions numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)

NPY_FINLINE __m512i npyv__setr_epi64(
npy_int64, npy_int64, npy_int64, npy_int64,
npy_int64, npy_int64, npy_int64, npy_int64
);
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
#else
return _mm512_set1_epi64(ai);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a, a, a, a, a, a, a);
#else
return _mm512_set1_epi64(a);
#endif
}
/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm512_setr_epi32(
(int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
(int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
(int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
(int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
);
#else
return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
#endif
}

NPY_FINLINE __m512 npyv__setr_ps(
19 changes: 19 additions & 0 deletions numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
typedef float npyv_lanetype_f32;
typedef double npyv_lanetype_f64;

#if defined(_MSC_VER) && defined(_M_IX86)
/*
* Avoid using any of the following intrinsics with MSVC 32-bit,
* even though they appear to work on newer versions.
* They have a bad impact on the generated instructions:
* sometimes the compiler handles them without respecting 32-bit mode,
* which leads to crashes from executing 64-bit instructions, and at
* other times it generates poorly emulated instructions.
*/
#undef _mm512_set1_epi64
#undef _mm256_set1_epi64x
#undef _mm_set1_epi64x
#undef _mm512_setr_epi64x
#undef _mm256_setr_epi64x
#undef _mm_setr_epi64x
#undef _mm512_set_epi64x
#undef _mm256_set_epi64x
#undef _mm_set_epi64x
#endif
#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
#include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
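
For code written against the universal intrinsics, the guard above is transparent: call sites keep using the npyv_* constructors and the correct path is chosen at compile time. A hypothetical caller is sketched below, assuming it is built where "simd/simd.h" resolves (numpy/core/src/common) and using the existing npyv load/store/add API; the kernel itself is illustrative and not taken from this diff.

#include "simd/simd.h"

#if NPY_SIMD
// Add a 64-bit constant to every element; only npyv_* constructors are used,
// so the MSVC 32-bit workaround applies here without any change.
static void add_constant_s64(npy_int64 *dst, npy_intp len, npy_int64 c)
{
    const npyv_s64 vc = npyv_setall_s64(c);   // safe broadcast on every target
    npy_intp i = 0;
    for (; i + npyv_nlanes_s64 <= len; i += npyv_nlanes_s64) {
        npyv_s64 v = npyv_load_s64(dst + i);
        npyv_store_s64(dst + i, npyv_add_s64(v, vc));
    }
    for (; i < len; ++i) {
        dst[i] += c;                          // scalar tail
    }
}
#endif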
25 changes: 23 additions & 2 deletions numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_f32 _mm_set1_ps
#define npyv_setall_f64 _mm_set1_pd

NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);

NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}
NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(a, a);
#else
return _mm_set1_epi64x((npy_int64)a);
#endif
}

/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
}
NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
{
#if defined(_MSC_VER) && defined(_M_IX86)
return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
#else
return _mm_set_epi64x(i1, i0);
#endif
}
NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
{