Skip to content

Commit

Permalink
Merge pull request #599 from Cuda-Chen/gemerate-more-random-test-inputs
Browse files Browse the repository at this point in the history
Align result to SSE when input is 0.0f/-0.0f in _mm_rsqrt_{ps, ss}
  • Loading branch information
jserv committed Jun 9, 2023
2 parents 6b61652 + 098a786 commit a057ce8
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 4 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,30 @@ their equivalents are built utilizing a number of NEON intrinsics.
For example, SSE intrinsic `_mm_loadu_si128` has a direct NEON mapping (`vld1q_s32`),
but SSE intrinsic `_mm_maddubs_epi16` has to be implemented with 13+ NEON instructions.

### Floating-point compatibility

Some conversions require several NEON intrinsics, which may produce inconsistent results
compared to their SSE counterparts due to differences in the arithmetic rules of IEEE-754.

Taking a possible conversion of `_mm_rsqrt_ps` as example:

```c
__m128 _mm_rsqrt_ps(__m128 in)
{
float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));

out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));

return vreinterpretq_m128_f32(out);
}
```
The `_mm_rsqrt_ps` conversion will produce NaN if a source value is `0.0` (first INF for the
reciprocal square root of `0.0`, then INF * `0.0` using `vmulq_f32`). In contrast,
the SSE counterpart produces INF if a source value is `0.0`.
As a result, additional treatments should be applied to ensure consistency between the conversion and its SSE counterpart.
## Usage
- Put the file `sse2neon.h` in to your source code directory.
Expand Down
16 changes: 16 additions & 0 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -2313,8 +2313,24 @@ FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));

// Generate masks for detecting whether input has any 0.0f/-0.0f
// (which becomes positive/negative infinity by IEEE-754 arthimetic rules).
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
const uint32x4_t has_pos_zero =
vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
const uint32x4_t has_neg_zero =
vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));

out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));

// Set output vector element to infinity/negative-infinity if
// the corresponding input vector element is 0.0f/-0.0f.
out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);

return vreinterpretq_m128_f32(out);
}

Expand Down
12 changes: 8 additions & 4 deletions tests/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,19 +357,23 @@ result_t validateFloatError(__m128 a,
float df2 = fabsf((t[2] - f2) / f2);
float df3 = fabsf((t[3] - f3) / f3);

if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0)) {
if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) ||
(std::isinf(t[0]) && std::isinf(f0))) {
df0 = 0;
}

if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0)) {
if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) ||
(std::isinf(t[1]) && std::isinf(f1))) {
df1 = 0;
}

if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0)) {
if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) ||
(std::isinf(t[2]) && std::isinf(f2))) {
df2 = 0;
}

if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0)) {
if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) ||
(std::isinf(t[3]) && std::isinf(f3))) {
df3 = 0;
}

Expand Down
21 changes: 21 additions & 0 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,27 @@ class SSE2NEONTestImpl : public SSE2NEONTest
mTestFloatPointer1[2] = 1.0f / mTestFloatPointer1[2];
mTestFloatPointer1[3] = 1.0f / mTestFloatPointer1[3];
}
if (test == it_mm_rcp_ps || test == it_mm_rcp_ss ||
test == it_mm_rsqrt_ps || test == it_mm_rsqrt_ss) {
if ((rand() & 3) == 0) {
uint32_t r1 = rand() & 3;
uint32_t r2 = rand() & 3;
uint32_t r3 = rand() & 3;
uint32_t r4 = rand() & 3;
uint32_t r5 = rand() & 3;
uint32_t r6 = rand() & 3;
uint32_t r7 = rand() & 3;
uint32_t r8 = rand() & 3;
mTestFloatPointer1[r1] = 0.0f;
mTestFloatPointer1[r2] = 0.0f;
mTestFloatPointer1[r3] = 0.0f;
mTestFloatPointer1[r4] = 0.0f;
mTestFloatPointer1[r5] = -0.0f;
mTestFloatPointer1[r6] = -0.0f;
mTestFloatPointer1[r7] = -0.0f;
mTestFloatPointer1[r8] = -0.0f;
}
}
if (test == it_mm_cmpge_ps || test == it_mm_cmpge_ss ||
test == it_mm_cmple_ps || test == it_mm_cmple_ss ||
test == it_mm_cmpeq_ps || test == it_mm_cmpeq_ss) {
Expand Down

0 comments on commit a057ce8

Please sign in to comment.