BUG: Resolve Divide by Zero on Apple silicon + test failures (#19926) #19955

Merged: 1 commit, Sep 26, 2021
numpy/core/src/umath/loops_unary_fp.dispatch.c.src (83 additions, 1 deletion)
@@ -77,6 +77,56 @@ NPY_FINLINE double c_square_f64(double a)
*/
#define CONTIG 0
#define NCONTIG 1

/*
* clang has a bug that's present at -O1 or greater. When partially loading a
* vector register for a reciprocal operation, the remaining elements are set
* to 1 to avoid divide-by-zero. The partial load is paired with a partial
* store after the reciprocal operation. clang notices that the entire register
* is not needed for the store and optimizes out the fill of 1 to the remaining
* elements. This causes either a divide-by-zero or 0/0 with invalid exception
* that we were trying to avoid by filling.
*
* Using a dummy variable marked 'volatile' convinces clang not to ignore
* the explicit fill of remaining elements. If `-ftrapping-math` is
* supported, then it'll also avoid the bug. `-ftrapping-math` is supported
* on Apple clang v12+ for x86_64. It is not currently supported for arm64.
* `-ftrapping-math` is set by default for NumPy builds in
* numpy/distutils/ccompiler.py.
*
* Note: Apple clang and upstream clang use different version numbers whose ranges overlap
*/
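
For context, here is a minimal scalar sketch (not part of this diff) of the pattern the comment above describes: a partial load that fills the unused lanes with 1, the reciprocal, a partial store, and a `volatile` copy modeling the workaround. `NLANES`, `lanes`, and `partial_recip` are illustrative stand-ins, not NumPy identifiers.

```c
/*
 * Illustrative sketch only: a scalar model of the partial
 * load / reciprocal / partial store pattern. Lanes [0, len) come from
 * memory; the rest are filled with 1 so the divide cannot trap. The
 * volatile copy makes every lane observable, so the fill of 1 cannot
 * be optimized away even though only lanes [0, len) are stored back.
 */
#include <stddef.h>

#define NLANES 4  /* stand-in for npyv_nlanes_f32 */

static void partial_recip(float *dst, const float *src, size_t len)
{
    float lanes[NLANES];

    /* partial load; remaining lanes filled with 1 (cf. npyv_load_till) */
    for (size_t i = 0; i < NLANES; i++) {
        lanes[i] = (i < len) ? src[i] : 1.0f;
    }

    /* the workaround: volatile copies keep all lanes observable */
    volatile float keep[NLANES];
    for (size_t i = 0; i < NLANES; i++) {
        keep[i] = lanes[i];
    }
    (void)keep;

    for (size_t i = 0; i < NLANES; i++) {
        lanes[i] = 1.0f / lanes[i];  /* would trap on an unfilled 0 lane */
    }

    /* partial store: only lanes [0, len) reach memory (cf. npyv_store_till) */
    for (size_t i = 0; i < len && i < NLANES; i++) {
        dst[i] = lanes[i];
    }
}
```
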
#if defined(__clang__)
#if defined(__apple_build_version__)
// Apple Clang
#if __apple_build_version__ < 12000000
// Apple Clang before v12
#define WORKAROUND_CLANG_RECIPROCAL_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Apple Clang after v12, targeting i386 or x86_64
#define WORKAROUND_CLANG_RECIPROCAL_BUG 0
#else
// Apple Clang after v12, not targeting i386 or x86_64
#define WORKAROUND_CLANG_RECIPROCAL_BUG 1
#endif
#else
// Clang, not Apple Clang
#if __clang_major__ < 10
// Clang before v10
#define WORKAROUND_CLANG_RECIPROCAL_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Clang v10+, targeting i386 or x86_64
#define WORKAROUND_CLANG_RECIPROCAL_BUG 0
#else
// Clang v10+, not targeting i386 or x86_64
#define WORKAROUND_CLANG_RECIPROCAL_BUG 1
#endif
#endif
#else
// Not a Clang compiler
#define WORKAROUND_CLANG_RECIPROCAL_BUG 0
#endif
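
Because the two version schemes overlap, it can be unclear which branch of the ladder fires on a given toolchain. A quick illustrative helper (not part of the diff) that prints the macros the ladder keys on:

```c
/* Illustrative helper: prints the version macros used by the
 * #if ladder above. Compile it with the clang under test. */
#include <stdio.h>

int main(void)
{
#if defined(__clang__)
#  if defined(__apple_build_version__)
    printf("Apple clang: __apple_build_version__ = %ld\n",
           (long)__apple_build_version__);
#  else
    printf("upstream clang %d.%d.%d\n",
           __clang_major__, __clang_minor__, __clang_patchlevel__);
#  endif
#else
    printf("not clang\n");
#endif
    return 0;
}
```
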

/**begin repeat
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
@@ -87,6 +137,7 @@ NPY_FINLINE double c_square_f64(double a)
* #kind = sqrt, absolute, square, reciprocal#
* #intr = sqrt, abs, square, recip#
* #repl_0w1 = 0, 0, 0, 1#
* #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
*/
/**begin repeat2
* #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -101,6 +152,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@

const int vstep = npyv_nlanes_@sfx@;
const int wstep = vstep * @unroll@;

// unrolled iterations
for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
/**begin repeat3
* #N = 0, 1, 2, 3#
@@ -126,7 +179,24 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
#endif
/**end repeat3**/
}

// vector-sized iterations
for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
#if @STYPE@ == CONTIG
npyv_@sfx@ v_src0 = npyv_load_@sfx@(src);
#else
npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc);
#endif
npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
#if @DTYPE@ == CONTIG
npyv_store_@sfx@(dst, v_unary0);
#else
npyv_storen_@sfx@(dst, sdst, v_unary0);
#endif
}

// last partial iteration, if needed
if (len > 0) {
#if @STYPE@ == CONTIG
#if @repl_0w1@
npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
@@ -140,20 +210,32 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
#endif
#endif
#if @RECIP_WORKAROUND@
/*
* Workaround clang bug. We use a dummy variable marked 'volatile'
* to convince clang that the entire vector is needed. We only
* want to do this for the last iteration / partial load-store of
* the loop since 'volatile' forces a refresh of the contents.
*/
volatile npyv_@sfx@ unused_but_workaround_bug = v_src0;
#endif // @RECIP_WORKAROUND@
npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
#if @DTYPE@ == CONTIG
npyv_store_till_@sfx@(dst, len, v_unary0);
#else
npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
#endif
}

npyv_cleanup();
}
/**end repeat2**/
/**end repeat1**/
#endif // @VCHK@
/**end repeat**/

#undef WORKAROUND_CLANG_RECIPROCAL_BUG
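
To make the control flow of the template above easier to follow, here is a scalar sketch (illustrative only; `VSTEP`, `UNROLL`, and `three_phase` are stand-in names) of the same three-phase decomposition: unrolled blocks, then single vector-width steps, then one partial tail iteration.

```c
/* Illustrative sketch: the three-phase structure of the SIMD loops
 * above, with scalar stand-ins. VSTEP plays the role of
 * npyv_nlanes_@sfx@ and UNROLL the role of @unroll@. */
#define VSTEP  4
#define UNROLL 4

static void three_phase(float *dst, const float *src, long len)
{
    const long wstep = VSTEP * UNROLL;

    /* 1) unrolled iterations: UNROLL vector ops per trip */
    for (; len >= wstep; len -= wstep, src += wstep, dst += wstep) {
        for (long n = 0; n < wstep; n++) {
            dst[n] = 1.0f / src[n];
        }
    }
    /* 2) vector-sized iterations: one vector op per trip */
    for (; len >= VSTEP; len -= VSTEP, src += VSTEP, dst += VSTEP) {
        for (long n = 0; n < VSTEP; n++) {
            dst[n] = 1.0f / src[n];
        }
    }
    /* 3) last partial iteration: fewer than VSTEP elements remain */
    if (len > 0) {
        for (long n = 0; n < len; n++) {
            dst[n] = 1.0f / src[n];
        }
    }
}
```
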

/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/