Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG, SIMD: Fix invalid value encountered in several ufuncs #22834

Merged
merged 8 commits into from Dec 20, 2022
37 changes: 20 additions & 17 deletions numpy/core/setup.py
Expand Up @@ -1162,23 +1162,26 @@ def generate_umath_doc_header(ext, build_dir):
# SIMD module #
#######################################################################

config.add_extension('_simd', sources=[
join('src', 'common', 'npy_cpu_features.c'),
join('src', '_simd', '_simd.c'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd.dispatch.c.src'),
], depends=[
join('src', 'common', 'npy_cpu_dispatch.h'),
join('src', 'common', 'simd', 'simd.h'),
join('src', '_simd', '_simd.h'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd_arg.inc'),
join('src', '_simd', '_simd_convert.inc'),
join('src', '_simd', '_simd_easyintrin.inc'),
join('src', '_simd', '_simd_vector.inc'),
])
config.add_extension('_simd',
sources=[
join('src', 'common', 'npy_cpu_features.c'),
join('src', '_simd', '_simd.c'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd.dispatch.c.src'),
], depends=[
join('src', 'common', 'npy_cpu_dispatch.h'),
join('src', 'common', 'simd', 'simd.h'),
join('src', '_simd', '_simd.h'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd_arg.inc'),
join('src', '_simd', '_simd_convert.inc'),
join('src', '_simd', '_simd_easyintrin.inc'),
join('src', '_simd', '_simd_vector.inc'),
],
libraries=['npymath']
)

config.add_subpackage('tests')
config.add_data_dir('tests/data')
Expand Down
24 changes: 23 additions & 1 deletion numpy/core/src/_simd/_simd.c
@@ -1,11 +1,33 @@
#include "_simd.h"

#include "numpy/npy_math.h"

/*
 * Python wrapper around npy_get_floatstatus(): returns the currently raised
 * FP status flags as a Python int.  Exposed on the _simd module so tests can
 * check whether a SIMD intrinsic raised a spurious FP exception
 * (e.g. FE_INVALID) — the subject of this PR.
 */
static PyObject *
get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
{
return PyLong_FromLong(npy_get_floatstatus());
}

/*
 * Python wrapper around npy_clear_floatstatus(): clears all FP status flags
 * and returns None.  Called by tests before exercising an intrinsic so that
 * get_floatstatus() afterwards reflects only the exceptions it raised.
 */
static PyObject *
clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
{
npy_clear_floatstatus();
Py_RETURN_NONE;
}

/* Method table for the _simd module: exposes the FP-status helpers to Python.
 * Both take no arguments (METH_NOARGS); docstrings are omitted (NULL). */
static PyMethodDef _simd_methods[] = {
{"get_floatstatus", get_floatstatus, METH_NOARGS, NULL},
{"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL},
{NULL, NULL, 0, NULL} /* sentinel */
};

PyMODINIT_FUNC PyInit__simd(void)
{
static struct PyModuleDef defs = {
.m_base = PyModuleDef_HEAD_INIT,
.m_name = "numpy.core._simd",
.m_size = -1
.m_size = -1,
.m_methods = _simd_methods
};
if (npy_cpu_init() < 0) {
return NULL;
Expand Down
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx2/avx2.h
Expand Up @@ -11,6 +11,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)

Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/common/simd/avx2/operators.h
Expand Up @@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)

// precision comparison
// precision comparison (ordered)
#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
Expand Down
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx512/avx512.h
Expand Up @@ -7,6 +7,7 @@
#define NPY_SIMD_F64 1
#define NPY_SIMD_FMA3 1 // native support
#define NPY_SIMD_BIGENDIAN 0
#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
Expand Down
149 changes: 81 additions & 68 deletions numpy/core/src/common/simd/neon/math.h
Expand Up @@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
return vrndnq_f32(a);
#else
// ARMv7 NEON only supports fp to int truncate conversion.
// a magic trick of adding 1.5 * 2**23 is used for rounding
// a magic trick of adding 1.5 * 2^23 is used for rounding
// to nearest even and then subtract this magic number to get
// the integer.
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
round = vbslq_f32(overflow, round, a);
// signed zero
round = vreinterpretq_f32_s32(vorrq_s32(
vreinterpretq_s32_f32(round),
vandq_s32(vreinterpretq_s32_f32(a), szero)
));
return round;
//
const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
npyv_u32 nnan_mask = vceqq_f32(a, a);
// eliminate nans to avoid invalid fp errors
npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
// round by add magic number 1.5 * 2^23
npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
// copysign
round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask ));
// a if |a| >= 2^23 or a == NaN
npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
mask = vandq_u32(mask, nnan_mask);
return vbslq_f32(mask, round, a);
#endif
}
#if NPY_SIMD_F64
Expand All @@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_ceil_f32 vrndpq_f32
#else
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
/**
* On armv7, vcvtq.f32 handles special cases as follows:
* NaN return 0
* +inf or +outrange return 0x80000000(-0.0f)
* -inf or -outrange return 0x7fffffff(nan)
*/
npyv_s32 roundi = vcvtq_s32_f32(a);
npyv_f32 round = vcvtq_f32_s32(roundi);
const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
npyv_u32 nnan_mask = vceqq_f32(a, a);
npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
// eliminate nans to avoid invalid fp errors
npyv_f32 abs_x = vabsq_f32(x);
// round by add magic number 1.5 * 2^23
npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
// copysign
round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
vandq_u32(vcltq_f32(round, a), one))
);
// respect signed zero, e.g. -0.5 -> -0.0
npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
vreinterpretq_s32_f32(ceil),
vandq_s32(vreinterpretq_s32_f32(a), szero)
));
// if nan or overflow return a
npyv_u32 nnan = npyv_notnan_f32(a);
npyv_u32 overflow = vorrq_u32(
vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
vandq_u32(vcltq_f32(round, x), one))
);
return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
// respects signed zero
ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
// a if |a| >= 2^23 or a == NaN
npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
mask = vandq_u32(mask, nnan_mask);
return vbslq_f32(mask, ceil, a);
}
#endif
#if NPY_SIMD_F64
Expand All @@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_trunc_f32 vrndq_f32
#else
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
const npyv_u32 sign_mask = vandq_u32(
vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));

npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
// eliminate NaNs/Inf to avoid invalid fp errors
npyv_f32 x = vreinterpretq_f32_u32(
veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
/**
* On armv7, vcvtq.f32 handles special cases as follows:
* NaN return 0
* +inf or +outrange return 0x80000000(-0.0f)
* -inf or -outrange return 0x7fffffff(nan)
*/
npyv_s32 roundi = vcvtq_s32_f32(a);
npyv_f32 round = vcvtq_f32_s32(roundi);
npyv_s32 trunci = vcvtq_s32_f32(x);
npyv_f32 trunc = vcvtq_f32_s32(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
vreinterpretq_s32_f32(round),
vandq_s32(vreinterpretq_s32_f32(a), szero)
));
// if nan or overflow return a
npyv_u32 nnan = npyv_notnan_f32(a);
npyv_u32 overflow = vorrq_u32(
vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
trunc = vreinterpretq_f32_u32(
vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
// if overflow return a
npyv_u32 overflow_mask = vorrq_u32(
vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
);
return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
// a if a overflow or nonfinite
return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
}
#endif
#if NPY_SIMD_F64
Expand All @@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_floor_f32 vrndmq_f32
#else
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23

npyv_s32 roundi = vcvtq_s32_f32(a);
npyv_f32 round = vcvtq_f32_s32(roundi);
npyv_u32 nnan_mask = vceqq_f32(a, a);
npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
// eliminate nans to avoid invalid fp errors
npyv_f32 abs_x = vabsq_f32(x);
// round by add magic number 1.5 * 2^23
npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
// copysign
round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
vandq_u32(vcgtq_f32(round, a), one)
));
// respect signed zero
npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
vreinterpretq_s32_f32(floor),
vandq_s32(vreinterpretq_s32_f32(a), szero)
vandq_u32(vcgtq_f32(round, x), one)
));
npyv_u32 nnan = npyv_notnan_f32(a);
npyv_u32 overflow = vorrq_u32(
vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
);

return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
// respects signed zero
floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
// a if |a| >= 2^23 or a == NaN
npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
mask = vandq_u32(mask, nnan_mask);
return vbslq_f32(mask, floor, a);
}
#endif // NPY_HAVE_ASIMD
#if NPY_SIMD_F64
Expand Down
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/neon/neon.h
Expand Up @@ -16,6 +16,7 @@
#define NPY_SIMD_FMA3 0 // HW emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
#define NPY_SIMD_CMPSIGNAL 1

typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
Expand Down
3 changes: 3 additions & 0 deletions numpy/core/src/common/simd/simd.h
Expand Up @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_FMA3 0
/// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
#define NPY_SIMD_BIGENDIAN 0
/// 1 if the supported comparison intrinsics (lt, le, gt, ge)
/// raise an FP invalid exception for quiet NaNs.
#define NPY_SIMD_CMPSIGNAL 0
#endif

// enable emulated mask operations for all SIMD extension except for AVX512
Expand Down