loops_unary_fp.dispatch.c.src
/*@targets
 ** $maxopt baseline
 ** sse2 vsx2 neon
 **/
/**
 * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
 * through the baseline, since scatter (AVX512F) and gather are very costly
 * at handling the non-contiguous memory accesses, compared with SSE, for
 * the small operations that this file covers.
 */
#define NPY_SIMD_FORCE_128
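/*
 * Note: with 128-bit registers forced, npyv_nlanes_f32 is 4 and
 * npyv_nlanes_f64 is 2 in the kernels below, regardless of which wider
 * features are enabled in the baseline.
 */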
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
/**********************************************************
** Scalars
**********************************************************/
#if !NPY_SIMD
NPY_FINLINE float c_recip_f32(float a)
{ return 1.0f / a; }
NPY_FINLINE float c_abs_f32(float a)
{
    const float tmp = a > 0 ? a : -a;
    /* add 0 to clear -0.0 */
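    /* (-0.0f) + 0.0f == +0.0f under the default round-to-nearest mode */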
    return tmp + 0;
}
NPY_FINLINE float c_square_f32(float a)
{ return a * a; }
#endif // !NPY_SIMD

#if !NPY_SIMD_F64
NPY_FINLINE double c_recip_f64(double a)
{ return 1.0 / a; }
NPY_FINLINE double c_abs_f64(double a)
{
    const double tmp = a > 0 ? a : -a;
    /* add 0 to clear -0.0 */
    return tmp + 0;
}
NPY_FINLINE double c_square_f64(double a)
{ return a * a; }
#endif // !NPY_SIMD_F64
/**
 * MSVC (32-bit mode) requires an explicitly contiguous loop
 * in order to use SSE; otherwise it falls back to a soft version
 * of square root that doesn't raise a domain error.
 */
#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
    #include <emmintrin.h>
    NPY_FINLINE float c_sqrt_f32(float _a)
    {
        __m128 a = _mm_load_ss(&_a);
        __m128 lower = _mm_sqrt_ss(a);
        return _mm_cvtss_f32(lower);
    }
    NPY_FINLINE double c_sqrt_f64(double _a)
    {
        __m128d a = _mm_load_sd(&_a);
        __m128d lower = _mm_sqrt_pd(a);
        return _mm_cvtsd_f64(lower);
    }
#else
    #define c_sqrt_f32 npy_sqrtf
    #define c_sqrt_f64 npy_sqrt
#endif
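/*
 * Note: the hardware sqrt instructions above set the IEEE invalid flag for
 * negative inputs, which is exactly the domain-error behavior the soft-float
 * path fails to provide.
 */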
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
/** Notes:
 * - avoid the use of libmath to unify fp/domain errors
 *   for both scalars and vectors among all compilers/architectures.
 * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
 *   to fill the remaining lanes with 1.0 to avoid a divide-by-zero fp
 *   exception in reciprocal.
 */
#define CONTIG 0
#define NCONTIG 1
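/*
 * CONTIG/NCONTIG select between the unit-stride load/store intrinsics
 * (npyv_load_*/npyv_store_*) and the strided ones
 * (npyv_loadn_*/npyv_storen_*) in the kernels below.
 */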
/*
 * clang has a bug that's present at -O1 or greater. When partially loading a
 * vector register for a reciprocal operation, the remaining elements are set
 * to 1 to avoid divide-by-zero. The partial load is paired with a partial
 * store after the reciprocal operation. clang notices that the entire register
 * is not needed for the store and optimizes out the fill of 1 to the remaining
 * elements. This causes either a divide-by-zero or 0/0 with invalid exception
 * that we were trying to avoid by filling.
 *
 * Using a dummy variable marked 'volatile' convinces clang not to ignore
 * the explicit fill of remaining elements. If `-ftrapping-math` is
 * supported, then it'll also avoid the bug. `-ftrapping-math` is supported
 * on Apple clang v12+ for x86_64. It is not currently supported for arm64.
 * `-ftrapping-math` is set by default in NumPy builds in
 * numpy/distutils/ccompiler.py.
 *
 * Note: Apple clang and clang upstream have different versions that overlap
 */
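/*
 * An illustrative sketch of the workaround as it appears in the generated
 * kernels (float32 reciprocal case; `src` and `len` are stand-ins here):
 *
 *   npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1); // tail lanes <- 1.0f
 *   volatile npyv_f32 keep = v_src0;  // defeat clang's dead-fill elimination
 *   npyv_f32 v_unary0 = npyv_recip_f32(v_src0); // no spurious divide-by-zero
 */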
#if defined(__clang__)
    #if defined(__apple_build_version__)
        // Apple Clang
        #if __apple_build_version__ < 12000000
            // Apple Clang before v12
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
            // Apple Clang after v12, targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
        #else
            // Apple Clang after v12, not targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #endif
    #else
        // Clang, not Apple Clang
        #if __clang_major__ < 10
            // Clang before v10
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
            // Clang v10+, targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
        #else
            // Clang v10+, not targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #endif
    #endif
#else
    // Not a Clang compiler
    #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
#endif
/**begin repeat
 * #TYPE = FLOAT, DOUBLE#
 * #sfx = f32, f64#
 * #VCHK = NPY_SIMD, NPY_SIMD_F64#
 */
#if @VCHK@
/**begin repeat1
 * #kind = sqrt, absolute, square, reciprocal#
 * #intr = sqrt, abs, square, recip#
 * #repl_0w1 = 0, 0, 0, 1#
 * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
 */
/**begin repeat2
 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
 * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
 * #unroll = 4, 4, 2, 2#
 */
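/*
 * Each (TYPE, kind, STYPE, DTYPE) combination above expands into one kernel,
 * e.g. simd_FLOAT_sqrt_CONTIG_CONTIG for the fully contiguous float32 sqrt
 * path; the dispatch functions at the bottom of this file pick among them
 * based on the runtime strides.
 */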
static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_@sfx@ *src = _src;
          npyv_lanetype_@sfx@ *dst = _dst;

    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * @unroll@;

    // unrolled iterations
    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
        /**begin repeat3
         * #N = 0, 1, 2, 3#
         */
        #if @unroll@ > @N@
            #if @STYPE@ == CONTIG
                npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
            #else
                npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
            #endif
            npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
        #endif
        /**end repeat3**/
        /**begin repeat3
         * #N = 0, 1, 2, 3#
         */
        #if @unroll@ > @N@
            #if @DTYPE@ == CONTIG
                npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
            #else
                npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
            #endif
        #endif
        /**end repeat3**/
    }

    // vector-sized iterations
    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
        #if @STYPE@ == CONTIG
            npyv_@sfx@ v_src0 = npyv_load_@sfx@(src);
        #else
            npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc);
        #endif
        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
        #if @DTYPE@ == CONTIG
            npyv_store_@sfx@(dst, v_unary0);
        #else
            npyv_storen_@sfx@(dst, sdst, v_unary0);
        #endif
    }

    // last partial iteration, if needed
    if (len > 0) {
        #if @STYPE@ == CONTIG
            #if @repl_0w1@
                npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
            #else
                npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
            #endif
        #else
            #if @repl_0w1@
                npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
            #else
                npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
            #endif
        #endif
        #if @RECIP_WORKAROUND@
            /*
             * Workaround clang bug. We use a dummy variable marked 'volatile'
             * to convince clang that the entire vector is needed. We only
             * want to do this for the last iteration / partial load-store of
             * the loop since 'volatile' forces a refresh of the contents.
             */
            volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
        #endif // @RECIP_WORKAROUND@
        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
        #if @DTYPE@ == CONTIG
            npyv_store_till_@sfx@(dst, len, v_unary0);
        #else
            npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
        #endif
    }

    npyv_cleanup();
}
/**end repeat2**/
/**end repeat1**/
#endif // @VCHK@
/**end repeat**/
#undef WORKAROUND_CLANG_RECIPROCAL_BUG
/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
/**begin repeat
 * #TYPE = FLOAT, DOUBLE#
 * #sfx = f32, f64#
 * #VCHK = NPY_SIMD, NPY_SIMD_F64#
 */
/**begin repeat1
 * #kind = sqrt, absolute, square, reciprocal#
 * #intr = sqrt, abs, square, recip#
 * #clear = 0, 1, 0, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const char *src = args[0]; char *dst = args[1];
    const npy_intp src_step = steps[0];
    const npy_intp dst_step = steps[1];
    npy_intp len = dimensions[0];
#if @VCHK@
    const int lsize = sizeof(npyv_lanetype_@sfx@);
    assert(src_step % lsize == 0 && dst_step % lsize == 0);
    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
        goto no_unroll;
    }
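    // convert the byte strides into lane (element) strides for the SIMD kernels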
    const npy_intp ssrc = src_step / lsize;
    const npy_intp sdst = dst_step / lsize;
    if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
        goto no_unroll;
    }
    if (ssrc == 1 && sdst == 1) {
        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
    }
    else if (sdst == 1) {
        simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
    }
    else if (ssrc == 1) {
        simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
    } else {
        simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
    }
    goto clear;
no_unroll:
#endif // @VCHK@
    for (; len > 0; --len, src += src_step, dst += dst_step) {
    #if @VCHK@
        // to guarantee the same precision and fp/domain errors for both scalars and vectors
        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
    #else
        const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
        *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
    #endif
    }
#if @VCHK@
clear:;
#endif
#if @clear@
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
/**end repeat1**/
/**end repeat**/