Merge pull request #19955 from charris/backport-19926
BUG: Resolve Divide by Zero on Apple silicon + test failures (#19926)
charris committed Sep 26, 2021
2 parents 75f55a2 + c91443b commit 13c259e
Showing 1 changed file with 83 additions and 1 deletion.
84 changes: 83 additions & 1 deletion numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -77,6 +77,56 @@ NPY_FINLINE double c_square_f64(double a)
*/
#define CONTIG 0
#define NCONTIG 1

/*
* clang has a bug that's present at -O1 or greater. When partially loading a
* vector register for a reciprocal operation, the remaining elements are set
* to 1 to avoid divide-by-zero. The partial load is paired with a partial
* store after the reciprocal operation. clang notices that the entire register
* is not needed for the store and optimizes out the fill of 1 to the remaining
* elements. This causes either a divide-by-zero or 0/0 with invalid exception
* that we were trying to avoid by filling.
*
* Using a dummy variable marked 'volatile' convinces clang not to ignore
* the explicit fill of remaining elements. If `-ftrapping-math` is
* supported, then it will also avoid the bug. `-ftrapping-math` is supported
* on Apple clang v12+ for x86_64, but it is not currently supported for arm64.
* `-ftrapping-math` is set by default for NumPy builds in
* numpy/distutils/ccompiler.py.
*
* Note: Apple clang and upstream clang use different version numbering schemes whose numbers overlap.
*/
#if defined(__clang__)
    #if defined(__apple_build_version__)
        // Apple Clang
        #if __apple_build_version__ < 12000000
            // Apple Clang before v12
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
            // Apple Clang after v12, targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
        #else
            // Apple Clang after v12, not targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #endif
    #else
        // Clang, not Apple Clang
        #if __clang_major__ < 10
            // Clang before v10
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
            // Clang v10+, targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
        #else
            // Clang v10+, not targeting i386 or x86_64
            #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
        #endif
    #endif
#else
    // Not a Clang compiler
    #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
#endif

/**begin repeat
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
@@ -87,6 +137,7 @@ NPY_FINLINE double c_square_f64(double a)
* #kind = sqrt, absolute, square, reciprocal#
* #intr = sqrt, abs, square, recip#
* #repl_0w1 = 0, 0, 0, 1#
* #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
*/
/**begin repeat2
* #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -101,6 +152,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@

const int vstep = npyv_nlanes_@sfx@;
const int wstep = vstep * @unroll@;

// unrolled iterations
for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
/**begin repeat3
* #N = 0, 1, 2, 3#
@@ -126,7 +179,24 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
#endif
/**end repeat3**/
}
for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {

// vector-sized iterations
for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
#if @STYPE@ == CONTIG
npyv_@sfx@ v_src0 = npyv_load_@sfx@(src);
#else
npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc);
#endif
npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
#if @DTYPE@ == CONTIG
npyv_store_@sfx@(dst, v_unary0);
#else
npyv_storen_@sfx@(dst, sdst, v_unary0);
#endif
}

// last partial iteration, if needed
if (len > 0) {
#if @STYPE@ == CONTIG
#if @repl_0w1@
npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
@@ -140,20 +210,32 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
#endif
#endif
#if @RECIP_WORKAROUND@
/*
* Workaround clang bug. We use a dummy variable marked 'volatile'
* to convince clang that the entire vector is needed. We only
* want to do this for the last iteration / partial load-store of
* the loop since 'volatile' forces a refresh of the contents.
*/
volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
#endif // @RECIP_WORKAROUND@
npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
#if @DTYPE@ == CONTIG
npyv_store_till_@sfx@(dst, len, v_unary0);
#else
npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
#endif
}

npyv_cleanup();
}
/**end repeat2**/
/**end repeat1**/
#endif // @VCHK@
/**end repeat**/

#undef WORKAROUND_CLANG_RECIPROCAL_BUG

/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
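
For reference, the restructured SIMD kernel in this diff follows a common three-tier tiling pattern: an unrolled block of full vectors, then single full vectors, then at most one partial tail step, which confines the fill-and-workaround logic to the final `if (len > 0)` branch. Below is a minimal scalar sketch of that control flow only (contiguous strides assumed); `VLEN`, `UNROLL`, and the squaring op are illustrative stand-ins for the template's `npyv_nlanes_@sfx@`, `@unroll@`, and SIMD intrinsics, not code from this commit.

#include <stddef.h>

#define VLEN   4   /* stand-in for npyv_nlanes_@sfx@ */
#define UNROLL 4   /* stand-in for the template's @unroll@ factor */

static void square_loop(const float *src, float *dst, ptrdiff_t len)
{
    const ptrdiff_t wstep = VLEN * UNROLL;

    /* 1) unrolled iterations: UNROLL "vectors" per pass */
    for (; len >= wstep; len -= wstep, src += wstep, dst += wstep) {
        for (ptrdiff_t i = 0; i < wstep; i++) {
            dst[i] = src[i] * src[i];
        }
    }
    /* 2) vector-sized iterations: one "vector" per pass */
    for (; len >= VLEN; len -= VLEN, src += VLEN, dst += VLEN) {
        for (int i = 0; i < VLEN; i++) {
            dst[i] = src[i] * src[i];
        }
    }
    /* 3) last partial iteration: fewer than VLEN elements remain */
    if (len > 0) {
        for (ptrdiff_t i = 0; i < len; i++) {
            dst[i] = src[i] * src[i];
        }
    }
}

Handling the tail outside the main loops is what lets the 'volatile' workaround above apply to a single partial load/store rather than to every pass.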
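
The tail step leans on the partial load/store universal intrinsics (`npyv_load_till_*`, `npyv_load_tillz_*`, `npyv_store_till_*`). A rough scalar model of their assumed semantics, simplified for illustration and not NumPy's actual implementation: `load_till` pads the unused lanes with a caller-supplied fill value (1 for reciprocal, selected via `@repl_0w1@`), `load_tillz` pads with zero, and `store_till` writes back only the first `len` lanes.

/* Illustrative scalar model only; LANES stands in for npyv_nlanes_f32. */
#define LANES 4

typedef struct { float lane[LANES]; } vec_f32;

static vec_f32 load_till_f32(const float *ptr, int n, float fill)
{
    vec_f32 v;
    for (int i = 0; i < LANES; i++) {
        v.lane[i] = (i < n) ? ptr[i] : fill;   /* unused lanes get `fill` */
    }
    return v;
}

static void store_till_f32(float *ptr, int n, vec_f32 v)
{
    for (int i = 0; i < n; i++) {              /* only the first n lanes are written */
        ptr[i] = v.lane[i];
    }
}

The clang bug described in the diff is precisely that the fill of the padding lanes can be optimized away because those lanes never reach the partial store.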

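Finally, a self-contained sketch of the failure shape and of the `volatile` workaround, written with plain SSE intrinsics rather than the `npyv_*` layer. It illustrates the pattern (pad the tail with 1.0, take a full-width reciprocal, store only the valid lanes, and keep the pad alive with a volatile dummy); it is not an exact reproducer of the miscompilation, and `recip_tail` is a hypothetical helper, not code from this commit.

#include <xmmintrin.h>

/* Reciprocal of a tail of `len` (1..3) floats. Unused lanes are padded with
 * 1.0f so the full-width division cannot raise divide-by-zero or 0/0. */
static void recip_tail(const float *src, float *dst, int len)
{
    float buf[4] = {1.0f, 1.0f, 1.0f, 1.0f};     /* explicit fill with 1.0 */
    for (int i = 0; i < len; i++) {
        buf[i] = src[i];
    }
    __m128 v = _mm_loadu_ps(buf);                /* "partial" load, padded */
#if defined(__clang__)
    /* Mirrors the commit's workaround: a dummy volatile keeps clang from
     * treating the padded lanes as dead and dropping the fill above. */
    volatile __m128 workaround_clang_bug = v;
#endif
    __m128 r = _mm_div_ps(_mm_set1_ps(1.0f), v); /* reciprocal of all 4 lanes */
    float out[4];
    _mm_storeu_ps(out, r);
    for (int i = 0; i < len; i++) {              /* partial store: valid lanes only */
        dst[i] = out[i];
    }
}

Building with `-ftrapping-math`, where the compiler supports it, is the alternative the comment mentions, presumably because floating-point exceptions are then treated as observable side effects that the optimizer must preserve.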