diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 0a5cb5a0f199..1a7b8ea706a7 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1162,23 +1162,26 @@ def generate_umath_doc_header(ext, build_dir): # SIMD module # ####################################################################### - config.add_extension('_simd', sources=[ - join('src', 'common', 'npy_cpu_features.c'), - join('src', '_simd', '_simd.c'), - join('src', '_simd', '_simd_inc.h.src'), - join('src', '_simd', '_simd_data.inc.src'), - join('src', '_simd', '_simd.dispatch.c.src'), - ], depends=[ - join('src', 'common', 'npy_cpu_dispatch.h'), - join('src', 'common', 'simd', 'simd.h'), - join('src', '_simd', '_simd.h'), - join('src', '_simd', '_simd_inc.h.src'), - join('src', '_simd', '_simd_data.inc.src'), - join('src', '_simd', '_simd_arg.inc'), - join('src', '_simd', '_simd_convert.inc'), - join('src', '_simd', '_simd_easyintrin.inc'), - join('src', '_simd', '_simd_vector.inc'), - ]) + config.add_extension('_simd', + sources=[ + join('src', 'common', 'npy_cpu_features.c'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd_arg.inc'), + join('src', '_simd', '_simd_convert.inc'), + join('src', '_simd', '_simd_easyintrin.inc'), + join('src', '_simd', '_simd_vector.inc'), + ], + libraries=['npymath'] + ) config.add_subpackage('tests') config.add_data_dir('tests/data') diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c index b1fdd4478d9d..52b66e7652a8 100644 --- a/numpy/core/src/_simd/_simd.c +++ b/numpy/core/src/_simd/_simd.c @@ -1,11 +1,33 @@ #include "_simd.h" +#include "numpy/npy_math.h" + +static PyObject * +get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) +{ + return PyLong_FromLong(npy_get_floatstatus()); +} + +static PyObject * +clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) +{ + npy_clear_floatstatus(); + Py_RETURN_NONE; +} + +static PyMethodDef _simd_methods[] = { + {"get_floatstatus", get_floatstatus, METH_NOARGS, NULL}, + {"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL}, + {NULL, NULL, 0, NULL} +}; + PyMODINIT_FUNC PyInit__simd(void) { static struct PyModuleDef defs = { .m_base = PyModuleDef_HEAD_INIT, .m_name = "numpy.core._simd", - .m_size = -1 + .m_size = -1, + .m_methods = _simd_methods }; if (npy_cpu_init() < 0) { return NULL; diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b8ec..d64f3c6d652f 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index c10267b21348..86e0038d9953 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) -// precision comparison +// precision comparison (orderd) #define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ)) #define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ)) #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ)) diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443e62..aa6abe256424 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index c0a771b5d1e7..01344a41fe25 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) return vrndnq_f32(a); #else // ARMv7 NEON only supports fp to int truncate conversion. - // a magic trick of adding 1.5 * 2**23 is used for rounding + // a magic trick of adding 1.5 * 2^23 is used for rounding // to nearest even and then subtract this magic number to get // the integer. - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); - const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 - npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); - npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); - round = vbslq_f32(overflow, round, a); - // signed zero - round = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - return round; + // + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)))); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask )); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, round, a); #endif } #if NPY_SIMD_F64 @@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 #else - NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); - /** - * On armv7, vcvtq.f32 handles special cases as follows: - * NaN return 0 - * +inf or +outrange return 0x80000000(-0.0f) - * -inf or -outrange return 0x7fffffff(nan) - */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcltq_f32(round, a), one)) - ); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(ceil), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + vandq_u32(vcltq_f32(round, x), one)) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, ceil, a); } #endif #if NPY_SIMD_F64 @@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_trunc_f32 vrndq_f32 #else - NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) + { const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 exp_mask = vdupq_n_u32(0xff000000); + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32( + vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero)); + + npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1); + nfinite_mask = vandq_u32(nfinite_mask, exp_mask); + nfinite_mask = vceqq_u32(nfinite_mask, exp_mask); + // elminate nans/inf to avoid invalid fp errors + npyv_f32 x = vreinterpretq_f32_u32( + veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a))); /** * On armv7, vcvtq.f32 handles special cases as follows: * NaN return 0 * +inf or +outrange return 0x80000000(-0.0f) * -inf or -outrange return 0x7fffffff(nan) */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_s32 trunci = vcvtq_s32_f32(x); + npyv_f32 trunc = vcvtq_f32_s32(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + trunc = vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask)); + // if overflow return a + npyv_u32 overflow_mask = vorrq_u32( + vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // a if a overflow or nonfinite + return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc); } #endif #if NPY_SIMD_F64 @@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_floor_f32 vrndmq_f32 #else - NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcgtq_f32(round, a), one) - )); - // respect signed zero - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(floor), - vandq_s32(vreinterpretq_s32_f32(a), szero) + vandq_u32(vcgtq_f32(round, x), one) )); - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) - ); - - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, floor, a); } #endif // NPY_HAVE_ASIMD #if NPY_SIMD_F64 diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index b0807152728a..49c35c415e25 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -16,6 +16,7 @@ #define NPY_SIMD_FMA3 0 // HW emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 92a77ad808fd..8c9b14251aa0 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_FMA3 0 /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. #define NPY_SIMD_BIGENDIAN 0 + /// 1 if the supported comparison intrinsics(lt, le, gt, ge) + /// raises FP invalid exception for quite NaNs. + #define NPY_SIMD_CMPSIGNAL 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 967240d09eab..6b42d8ba3818 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f32 szero = _mm_set1_ps(-0.0f); - __m128i roundi = _mm_cvtps_epi32(a); - __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); - __m128 r = _mm_cvtepi32_ps(roundi); - // respect sign of zero - r = _mm_or_ps(r, _mm_and_ps(a, szero)); - return npyv_select_f32(overflow, a, r); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + // respect signed zero + round = _mm_or_ps(round, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round); #endif } @@ -285,16 +295,20 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #ifdef NPY_HAVE_SSE41 return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // elminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 // assuming that MXCSR register is set to rounding - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_a), two_power_52); + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); // copysign - npyv_f64 round = _mm_or_pd(abs_round, _mm_and_pd(a, szero)); - // a if a >= 2^52 - return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, round); + round = _mm_or_pd(round, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, round); #endif } // ceil @@ -304,24 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 round = _mm_cvtepi32_ps(roundi); - npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one)); + // respect signed zero + ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil); } NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - return _mm_or_pd(ceil, _mm_and_pd(a, szero)); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // elminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one)); + // respects sign of 0.0 + ceil = _mm_or_pd(ceil, sign_x); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, ceil); } #endif @@ -332,26 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 trunc = _mm_cvtepi32_ps(roundi); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // elminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i trunci = _mm_cvttps_epi32(x); + __m128 trunc = _mm_cvtepi32_ps(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero)); + trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc); } NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // elminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52); - npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one); - npyv_f64 trunc = _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero)); - // a if a >= 2^52 - return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, trunc); + // assuming that MXCSR register is set to rounding + __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one); + __m128d trunc = _mm_sub_pd(abs_round, subtrahend); + // copysign + trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, trunc); } #endif @@ -362,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) { - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_f32 round = npyv_rint_f32(a); - return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one)); + // respect signed zero + floor = _mm_or_ps(floor, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor); } NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) { - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + const __m128d one = _mm_set1_pd(1.0f); + const __m128d szero = _mm_set1_pd(-0.0f); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // elminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, floor); } #endif // NPY_HAVE_SSE41 diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda724d..0c6b8cdbab2e 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h index abcd33ce14bf..1d4508669286 100644 --- a/numpy/core/src/common/simd/vec/vec.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -39,8 +39,10 @@ #ifdef NPY_HAVE_VX #define NPY_SIMD_BIGENDIAN 1 + #define NPY_SIMD_CMPSIGNAL 0 #else #define NPY_SIMD_BIGENDIAN 0 + #define NPY_SIMD_CMPSIGNAL 1 #endif typedef __vector unsigned char npyv_u8; diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index 78685e807ad2..9f9978e66824 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -124,6 +124,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } else { x_in = npyv_loadn_tillz_f32(src, ssrc, len); } + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + #if NPY_SIMD_CMPSIGNAL + // Eliminate NaN to avoid FP invalid exception + x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask))); + #endif npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); /* @@ -132,7 +137,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, * these numbers */ if (simd_maski != 0) { - npyv_b32 nnan_mask = npyv_notnan_f32(x_in); npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src index 46ce51824358..d6703bfb94fa 100644 --- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src @@ -12,10 +12,12 @@ /**begin repeat * #sfx = f32, f64# * #func_suffix = f16, 8# + * #len = 32, 64# */ /**begin repeat1 * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# * #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0# + * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ static void simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, @@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, x = npyv_loadn_tillz_@sfx@(src, ssrc, len); } #endif + #if @fxnan@ == @len@ + // Workaround, invalid value encountered when x is set to nan + npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x); + npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@)); + npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan); + out = npyv_select_@sfx@(nnan_mask, out, x); + #else npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + #endif if (sdst == 1) { npyv_store_till_@sfx@(dst, len, out); } else { diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 264300621a16..2c16243db306 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -2,9 +2,24 @@ # may be involved in their functionality. import pytest, math, re import itertools -from numpy.core._simd import targets +import operator +from numpy.core._simd import targets, clear_floatstatus, get_floatstatus from numpy.core._multiarray_umath import __cpu_baseline__ +def check_floatstatus(divbyzero=False, overflow=False, + underflow=False, invalid=False, + all=False): + #define NPY_FPE_DIVIDEBYZERO 1 + #define NPY_FPE_OVERFLOW 2 + #define NPY_FPE_UNDERFLOW 4 + #define NPY_FPE_INVALID 8 + err = get_floatstatus() + ret = (all or divbyzero) and (err & 1) != 0 + ret |= (all or overflow) and (err & 2) != 0 + ret |= (all or underflow) and (err & 4) != 0 + ret |= (all or invalid) and (err & 8) != 0 + return ret + class _Test_Utility: # submodule of the desired SIMD extension, e.g. targets["AVX512F"] npyv = None @@ -553,7 +568,16 @@ def test_special_cases(self): nnan = self.notnan(self.setall(self._nan())) assert nnan == [0]*self.nlanes - import operator + @pytest.mark.parametrize("intrin_name", [ + "rint", "trunc", "ceil", "floor" + ]) + def test_unary_invalid_fpexception(self, intrin_name): + intrin = getattr(self, intrin_name) + for d in [float("nan"), float("inf"), -float("inf")]: + v = self.setall(d) + clear_floatstatus() + intrin(v) + assert check_floatstatus(invalid=True) == False @pytest.mark.parametrize('py_comp,np_comp', [ (operator.lt, "cmplt"), @@ -571,7 +595,8 @@ def to_bool(vector): return [lane == mask_true for lane in vector] intrin = getattr(self, np_comp) - cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan), (ninf, nan)) + cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan), + (ninf, nan), (-0.0, +0.0)) for case_operand1, case_operand2 in cmp_cases: data_a = [case_operand1]*self.nlanes data_b = [case_operand2]*self.nlanes @@ -881,41 +906,29 @@ def test_reorder_rev64(self): rev64 = self.rev64(self.load(range(self.nlanes))) assert rev64 == data_rev64 - def test_operators_comparison(self): + @pytest.mark.parametrize('func, intrin', [ + (operator.lt, "cmplt"), + (operator.le, "cmple"), + (operator.gt, "cmpgt"), + (operator.ge, "cmpge"), + (operator.eq, "cmpeq") + ]) + def test_operators_comparison(self, func, intrin): if self._is_fp(): data_a = self._data() else: data_a = self._data(self._int_max() - self.nlanes) data_b = self._data(self._int_min(), reverse=True) vdata_a, vdata_b = self.load(data_a), self.load(data_b) + intrin = getattr(self, intrin) mask_true = self._true_mask() def to_bool(vector): return [lane == mask_true for lane in vector] - # equal - data_eq = [a == b for a, b in zip(data_a, data_b)] - cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b)) - assert cmpeq == data_eq - # not equal - data_neq = [a != b for a, b in zip(data_a, data_b)] - cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b)) - assert cmpneq == data_neq - # greater than - data_gt = [a > b for a, b in zip(data_a, data_b)] - cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b)) - assert cmpgt == data_gt - # greater than and equal - data_ge = [a >= b for a, b in zip(data_a, data_b)] - cmpge = to_bool(self.cmpge(vdata_a, vdata_b)) - assert cmpge == data_ge - # less than - data_lt = [a < b for a, b in zip(data_a, data_b)] - cmplt = to_bool(self.cmplt(vdata_a, vdata_b)) - assert cmplt == data_lt - # less than and equal - data_le = [a <= b for a, b in zip(data_a, data_b)] - cmple = to_bool(self.cmple(vdata_a, vdata_b)) - assert cmple == data_le + + data_cmp = [func(a, b) for a, b in zip(data_a, data_b)] + cmp = to_bool(intrin(vdata_a, vdata_b)) + assert cmp == data_cmp def test_operators_logical(self): if self._is_fp(): diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index e0f91e326745..884578860712 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -21,6 +21,15 @@ ) from numpy.testing._private.utils import _glibc_older_than +UFUNCS = [obj for obj in np.core.umath.__dict__.values() + if isinstance(obj, np.ufunc)] + +UFUNCS_UNARY = [ + uf for uf in UFUNCS if uf.nin == 1 +] +UFUNCS_UNARY_FP = [ + uf for uf in UFUNCS_UNARY if 'f->f' in uf.types +] def interesting_binop_operands(val1, val2, dtype): """ @@ -1604,15 +1613,74 @@ def test_expm1(self): np.array(value, dtype=dt)) # test to ensure no spurious FP exceptions are raised due to SIMD - def test_spurious_fpexception(self): - for dt in ['e', 'f', 'd']: - arr = np.array([1.0, 2.0], dtype=dt) - with assert_no_warnings(): - np.log(arr) - np.log2(arr) - np.log10(arr) - np.arccosh(arr) - + INF_INVALID_ERR = [ + np.cos, np.sin, np.tan, np.arccos, np.arcsin, np.spacing, np.arctanh + ] + NEG_INVALID_ERR = [ + np.log, np.log2, np.log10, np.log1p, np.sqrt, np.arccosh, + np.arctanh + ] + ONE_INVALID_ERR = [ + np.arctanh, + ] + LTONE_INVALID_ERR = [ + np.arccosh, + ] + BYZERO_ERR = [ + np.log, np.log2, np.log10, np.reciprocal, np.arccosh + ] + + @pytest.mark.parametrize("ufunc", UFUNCS_UNARY_FP) + @pytest.mark.parametrize("dtype", ('e', 'f', 'd')) + @pytest.mark.parametrize("data, escape", ( + ([0.03], LTONE_INVALID_ERR), + ([0.03]*32, LTONE_INVALID_ERR), + # neg + ([-1.0], NEG_INVALID_ERR), + ([-1.0]*32, NEG_INVALID_ERR), + # flat + ([1.0], ONE_INVALID_ERR), + ([1.0]*32, ONE_INVALID_ERR), + # zero + ([0.0], BYZERO_ERR), + ([0.0]*32, BYZERO_ERR), + ([-0.0], BYZERO_ERR), + ([-0.0]*32, BYZERO_ERR), + # nan + ([0.5, 0.5, 0.5, np.nan], LTONE_INVALID_ERR), + ([0.5, 0.5, 0.5, np.nan]*32, LTONE_INVALID_ERR), + ([np.nan, 1.0, 1.0, 1.0], ONE_INVALID_ERR), + ([np.nan, 1.0, 1.0, 1.0]*32, ONE_INVALID_ERR), + ([np.nan], []), + ([np.nan]*32, []), + # inf + ([0.5, 0.5, 0.5, np.inf], INF_INVALID_ERR + LTONE_INVALID_ERR), + ([0.5, 0.5, 0.5, np.inf]*32, INF_INVALID_ERR + LTONE_INVALID_ERR), + ([np.inf, 1.0, 1.0, 1.0], INF_INVALID_ERR), + ([np.inf, 1.0, 1.0, 1.0]*32, INF_INVALID_ERR), + ([np.inf], INF_INVALID_ERR), + ([np.inf]*32, INF_INVALID_ERR), + # ninf + ([0.5, 0.5, 0.5, -np.inf], + NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR), + ([0.5, 0.5, 0.5, -np.inf]*32, + NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR), + ([-np.inf, 1.0, 1.0, 1.0], NEG_INVALID_ERR + INF_INVALID_ERR), + ([-np.inf, 1.0, 1.0, 1.0]*32, NEG_INVALID_ERR + INF_INVALID_ERR), + ([-np.inf], NEG_INVALID_ERR + INF_INVALID_ERR), + ([-np.inf]*32, NEG_INVALID_ERR + INF_INVALID_ERR), + )) + def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape): + if escape and ufunc in escape: + return + # FIXME: NAN raises FP invalid exception: + # - ceil/float16 on MSVC:32-bit + # - spacing/float16 on almost all platforms + if ufunc in (np.spacing, np.ceil) and dtype == 'e': + return + array = np.array(data, dtype=dtype) + with assert_no_warnings(): + ufunc(array) class TestFPClass: @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])