Commit d337ba9
Merge pull request #22834 from charris/backport-22771
BUG, SIMD: Fix invalid value encountered in several ufuncs
charris committed Dec 20, 2022
2 parents bd87203 + f930e48 commit d337ba9
Showing 15 changed files with 401 additions and 173 deletions.
37 changes: 20 additions & 17 deletions numpy/core/setup.py
@@ -1162,23 +1162,26 @@ def generate_umath_doc_header(ext, build_dir):
     # SIMD module                                                         #
     #######################################################################
 
-    config.add_extension('_simd', sources=[
-        join('src', 'common', 'npy_cpu_features.c'),
-        join('src', '_simd', '_simd.c'),
-        join('src', '_simd', '_simd_inc.h.src'),
-        join('src', '_simd', '_simd_data.inc.src'),
-        join('src', '_simd', '_simd.dispatch.c.src'),
-    ], depends=[
-        join('src', 'common', 'npy_cpu_dispatch.h'),
-        join('src', 'common', 'simd', 'simd.h'),
-        join('src', '_simd', '_simd.h'),
-        join('src', '_simd', '_simd_inc.h.src'),
-        join('src', '_simd', '_simd_data.inc.src'),
-        join('src', '_simd', '_simd_arg.inc'),
-        join('src', '_simd', '_simd_convert.inc'),
-        join('src', '_simd', '_simd_easyintrin.inc'),
-        join('src', '_simd', '_simd_vector.inc'),
-    ])
+    config.add_extension('_simd',
+        sources=[
+            join('src', 'common', 'npy_cpu_features.c'),
+            join('src', '_simd', '_simd.c'),
+            join('src', '_simd', '_simd_inc.h.src'),
+            join('src', '_simd', '_simd_data.inc.src'),
+            join('src', '_simd', '_simd.dispatch.c.src'),
+        ], depends=[
+            join('src', 'common', 'npy_cpu_dispatch.h'),
+            join('src', 'common', 'simd', 'simd.h'),
+            join('src', '_simd', '_simd.h'),
+            join('src', '_simd', '_simd_inc.h.src'),
+            join('src', '_simd', '_simd_data.inc.src'),
+            join('src', '_simd', '_simd_arg.inc'),
+            join('src', '_simd', '_simd_convert.inc'),
+            join('src', '_simd', '_simd_easyintrin.inc'),
+            join('src', '_simd', '_simd_vector.inc'),
+        ],
+        libraries=['npymath']
+    )
 
     config.add_subpackage('tests')
     config.add_data_dir('tests/data')
24 changes: 23 additions & 1 deletion numpy/core/src/_simd/_simd.c
@@ -1,11 +1,33 @@
 #include "_simd.h"
 
+#include "numpy/npy_math.h"
+
+static PyObject *
+get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    return PyLong_FromLong(npy_get_floatstatus());
+}
+
+static PyObject *
+clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    npy_clear_floatstatus();
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef _simd_methods[] = {
+    {"get_floatstatus", get_floatstatus, METH_NOARGS, NULL},
+    {"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL}
+};
+
 PyMODINIT_FUNC PyInit__simd(void)
 {
     static struct PyModuleDef defs = {
         .m_base = PyModuleDef_HEAD_INIT,
         .m_name = "numpy.core._simd",
-        .m_size = -1
+        .m_size = -1,
+        .m_methods = _simd_methods
     };
     if (npy_cpu_init() < 0) {
         return NULL;
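
Note: the two module-level methods added above are what let the SIMD test suite
assert that a kernel leaves the FP status word untouched, and linking npymath
in setup.py is what provides the underlying npy_get_floatstatus() and
npy_clear_floatstatus() symbols. A minimal sketch of that check on the C side;
the helper name and kernel argument are illustrative, not part of this commit:

#include "numpy/npy_math.h"

// Clear the sticky FP exception flags, run a kernel, then verify that
// no invalid-operand exception was raised along the way.
static int
runs_without_fp_invalid(void (*kernel)(void))
{
    npy_clear_floatstatus();
    kernel();
    return (npy_get_floatstatus() & NPY_FPE_INVALID) == 0;
}
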
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
 #define NPY_SIMD_FMA3 0 // fast emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
 // Enough limit to allow us to use _mm256_i32gather_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
 
2 changes: 1 addition & 1 deletion numpy/core/src/common/simd/avx2/operators.h
@@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
 #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
 #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
 
-// precision comparison
+// precision comparison (ordered)
 #define npyv_cmpeq_f32(A, B)  _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
 #define npyv_cmpeq_f64(A, B)  _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
 #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
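
The "(ordered)" note documents why these predicates are safe for NaNs: an
ordered-quiet predicate like _CMP_EQ_OQ compares false when either operand is
NaN, an unordered-quiet one like _CMP_NEQ_UQ compares true, and neither raises
the FP invalid exception for quiet NaNs (the signaling OS/US variants do).
A standalone sketch, not taken from the commit:

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    __m256 nan = _mm256_set1_ps(NAN);
    __m256 one = _mm256_set1_ps(1.0f);
    // NaN == 1.0 is false under the ordered predicate ...
    int eq  = _mm256_movemask_ps(_mm256_cmp_ps(nan, one, _CMP_EQ_OQ));
    // ... while NaN != 1.0 is true under the unordered predicate.
    int neq = _mm256_movemask_ps(_mm256_cmp_ps(nan, one, _CMP_NEQ_UQ));
    printf("eq=0x%02x neq=0x%02x\n", eq, neq); // eq=0x00 neq=0xff
    return 0;
}
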
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
 #define NPY_SIMD_F64 1
 #define NPY_SIMD_FMA3 1 // native support
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
149 changes: 81 additions & 68 deletions numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
     return vrndnq_f32(a);
 #else
     // ARMv7 NEON only supports fp to int truncate conversion.
-    // a magic trick of adding 1.5 * 2**23 is used for rounding
+    // a magic trick of adding 1.5 * 2^23 is used for rounding
     // to nearest even and then subtract this magic number to get
     // the integer.
-    const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
-    const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
-    npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
-    npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
-    round = vbslq_f32(overflow, round, a);
-    // signed zero
-    round = vreinterpretq_f32_s32(vorrq_s32(
-        vreinterpretq_s32_f32(round),
-        vandq_s32(vreinterpretq_s32_f32(a), szero)
-    ));
-    return round;
+    //
+    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+    npyv_u32 nnan_mask = vceqq_f32(a, a);
+    // eliminate NaNs to avoid invalid fp errors
+    npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+    // round by adding magic number 1.5 * 2^23
+    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+    // copysign
+    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
+    // a if |a| >= 2^23 or a == NaN
+    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+    mask = vandq_u32(mask, nnan_mask);
+    return vbslq_f32(mask, round, a);
 #endif
 }
 #if NPY_SIMD_F64
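
The rewrite keeps the magic-number trick but feeds it absolute values with NaN
lanes scrubbed first. Why the trick works: once a nonnegative x is added to
1.5 * 2^23, the sum's unit in the last place is at least 1, so the hardware's
round-to-nearest-even mode rounds away the fraction during the addition
itself; subtracting the constant back recovers rint(x) using only FP add/sub,
which never raise the invalid exception the way a NaN fed to vcvtq would.
A scalar illustration, not from the commit, valid under the stated range
assumption:

#include <stdio.h>

static float rint_magic(float x)       // assumes 0.0f <= x, x well below 2^22
{
    const float magic = 12582912.0f;   // 1.5 * 2^23
    volatile float t = x + magic;      // rounding to integer happens here
    return t - magic;                  // the subtraction is exact
}

int main(void)
{
    // ties round to even, matching rint()'s default behavior
    printf("%g %g %g\n", rint_magic(2.5f), rint_magic(3.5f), rint_magic(2.4f));
    // -> 2 4 2
    return 0;
}
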
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_ceil_f32 vrndpq_f32
 #else
     NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
     {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
-        /**
-         * On armv7, vcvtq.f32 handles special cases as follows:
-         *  NaN return 0
-         * +inf or +outrange return 0x80000000(-0.0f)
-         * -inf or -outrange return 0x7fffffff(nan)
-         */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate NaNs to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by adding magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcltq_f32(round, a), one))
+            vandq_u32(vcltq_f32(round, x), one))
         );
-        // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(ceil),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
-        );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+        mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, ceil, a);
     }
 #endif
 #if NPY_SIMD_F64
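
ceil is then derived from the round-to-nearest result: wherever rounding went
below the input, add exactly 1.0. The vector code does this branchlessly by
AND-ing the all-ones comparison mask with the bit pattern of 1.0f; a scalar
mirror of that idea, illustrative only:

#include <stdint.h>
#include <string.h>

static float ceil_from_round(float x, float round)  // round = rint(x)
{
    uint32_t mask = (round < x) ? 0xffffffffu : 0u; // compare mask: all ones or zero
    uint32_t one_bits;
    float one = 1.0f, addend;
    memcpy(&one_bits, &one, sizeof one_bits);
    one_bits &= mask;                               // bit pattern of 1.0f or +0.0f
    memcpy(&addend, &one_bits, sizeof addend);
    return round + addend;
}
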
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_trunc_f32 vrndq_f32
 #else
     NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
     {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
         const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(
+            vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+        npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+        nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+        nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+        // eliminate NaNs/infs to avoid invalid fp errors
+        npyv_f32 x = vreinterpretq_f32_u32(
+            veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
         /**
          * On armv7, vcvtq.f32 handles special cases as follows:
          *  NaN return 0
          * +inf or +outrange return 0x80000000(-0.0f)
          * -inf or -outrange return 0x7fffffff(nan)
         */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_s32 trunci = vcvtq_s32_f32(x);
+        npyv_f32 trunc = vcvtq_f32_s32(trunci);
         // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(round),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+        trunc = vreinterpretq_f32_u32(
+            vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+        // if overflow return a
+        npyv_u32 overflow_mask = vorrq_u32(
+            vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
         );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // a if a overflows or is nonfinite
+        return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
     }
 #endif
 #if NPY_SIMD_F64
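
trunc still needs the truncating vcvtq conversion, so here the guarded lanes
are all nonfinite values, detected directly from the exponent field: shifting
the sign bit out leaves the 8 exponent bits at the top of the word, and an
all-ones exponent means +/-inf or NaN. A scalar equivalent of the
nfinite_mask computation, illustrative only:

#include <stdint.h>
#include <string.h>

static int is_nonfinite(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);             // type-pun like vreinterpretq
    bits <<= 1;                                 // drop the sign bit
    return (bits & 0xff000000u) == 0xff000000u; // exponent all ones?
}
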
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_floor_f32 vrndmq_f32
 #else
     NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
     {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
 
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate NaNs to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by adding magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcgtq_f32(round, a), one)
+            vandq_u32(vcgtq_f32(round, x), one)
         ));
-        // respect signed zero
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(floor),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
-        );
-
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+        mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, floor, a);
     }
 #endif // NPY_HAVE_ASIMD
 #if NPY_SIMD_F64
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
 #define NPY_SIMD_FMA3 0 // HW emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
 
 typedef uint8x16_t npyv_u8;
 typedef int8x16_t  npyv_s8;
3 changes: 3 additions & 0 deletions numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
     #define NPY_SIMD_FMA3 0
     /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
     #define NPY_SIMD_BIGENDIAN 0
+    /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
+    /// raise an FP invalid exception for quiet NaNs.
+    #define NPY_SIMD_CMPSIGNAL 0
 #endif
 
 // enable emulated mask operations for all SIMD extensions except for AVX512
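
NPY_SIMD_CMPSIGNAL gives dispatched kernels a portable way to ask whether the
target's lt/le/gt/ge compares can signal on quiet NaNs (1 on NEON above, 0 on
AVX2/AVX512, whose quiet _CMP_* predicates never do). A hypothetical guard a
kernel might write against it; the shape below is illustrative, not code from
this commit:

#if NPY_SIMD_CMPSIGNAL
    // compares may raise FP invalid on quiet NaNs: scrub NaN lanes first,
    // e.g. mask the operand with npyv_notnan_f32(a) before npyv_cmplt_f32.
#else
    // quiet compare predicates: operands can be compared directly.
#endif
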
