Skip to content

Commit

Permalink
Merge pull request #19046 from seiko2plus/issue_19045
Browse files Browse the repository at this point in the history
BUG, SIMD: Fix unexpected result of uint8 division on X86
  • Loading branch information
charris committed May 20, 2021
2 parents d707f39 + 519ab99 commit 36cc9be
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 49 deletions.
8 changes: 4 additions & 4 deletions numpy/core/src/common/simd/avx2/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,16 @@
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
__m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor.val[0]);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
__m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor.val[0]);
__m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi8(a, mulhi);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
Expand Down
19 changes: 10 additions & 9 deletions numpy/core/src/common/simd/avx512/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,13 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
#ifdef NPY_HAVE_AVX512BW
const __m512i bmask = _mm512_set1_epi32(0x00FF00FF);
const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m512i mulhi_odd = _mm512_mulhi_epu16(a, divisor.val[0]);
__m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]);
__m512i mulhi_even = _mm512_mullo_epi16(_mm512_and_si512(a, bmask), divisor.val[0]);
mulhi_even = _mm512_srli_epi16(mulhi_even, 8);
__m512i mulhi_odd = _mm512_mullo_epi16(_mm512_srli_epi16(a, 8), divisor.val[0]);
__m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m512i q = _mm512_sub_epi8(a, mulhi);
Expand All @@ -130,18 +131,18 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b);
return q;
#else
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b);
const __m256i mulc = npyv512_lower_si256(divisor.val[0]);
//// lower 256-bit
__m256i lo_a = npyv512_lower_si256(a);
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc);
__m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(lo_a, bmask), mulc);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
__m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(lo_a, 8), mulc);
__m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i lo_q = _mm256_sub_epi8(lo_a, mulhi);
lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b);
Expand All @@ -151,10 +152,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
//// higher 256-bit
__m256i hi_a = npyv512_higher_si256(a);
// high part of unsigned multiplication
mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc);
mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc);
mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(hi_a, bmask), mulc);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(hi_a, 8), mulc);
mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i hi_q = _mm256_sub_epi8(hi_a, mulhi);
hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b);
Expand Down
4 changes: 3 additions & 1 deletion numpy/core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,16 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
sh1 = 1; sh2 = l - 1; // shift counts
}
npyv_u8x3 divisor;
divisor.val[0] = npyv_setall_u8(m);
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[0] = npyv_setall_u16(m);
divisor.val[1] = npyv_set_u8(sh1);
divisor.val[2] = npyv_set_u8(sh2);
#elif defined(NPY_HAVE_VSX2)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_setall_u8(sh1);
divisor.val[2] = npyv_setall_u8(sh2);
#elif defined(NPY_HAVE_NEON)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
#else
Expand Down
8 changes: 4 additions & 4 deletions numpy/core/src/common/simd/sse/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m128i bmask = _mm_set1_epi32(0xFF00FF00);
const __m128i bmask = _mm_set1_epi32(0x00FF00FF);
const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1]));
const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2]));
// high part of unsigned multiplication
__m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]);
__m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]);
__m128i mulhi_even = _mm_mullo_epi16(_mm_and_si128(a, bmask), divisor.val[0]);
__m128i mulhi_odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8), divisor.val[0]);
mulhi_even = _mm_srli_epi16(mulhi_even, 8);
__m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even);
__m128i mulhi = npyv_select_u8(bmask, mulhi_even, mulhi_odd);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m128i q = _mm_sub_epi8(a, mulhi);
q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b);
Expand Down
11 changes: 7 additions & 4 deletions numpy/core/tests/test_simd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
# may be involved in their functionality.
import pytest, math, re
import itertools
from numpy.core._simd import targets
from numpy.core._multiarray_umath import __cpu_baseline__

Expand Down Expand Up @@ -820,8 +821,10 @@ def test_arithmetic_intdiv(self):
def trunc_div(a, d):
"""
Divide towards zero works with large integers > 2^53,
equivalent to int(a/d)
and wrap around overflow similar to what C does.
"""
if d == -1 and a == int_min:
return a
sign_a, sign_d = a < 0, d < 0
if a == 0 or sign_a == sign_d:
return a // d
Expand All @@ -833,9 +836,9 @@ def trunc_div(a, d):
0, 1, self.nlanes, int_max-self.nlanes,
int_min, int_min//2 + 1
)
divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2)
divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2)

for x, d in zip(rdata, divisors):
for x, d in itertools.product(rdata, divisors):
data = self._data(x)
vdata = self.load(data)
data_divc = [trunc_div(a, d) for a in data]
Expand All @@ -848,7 +851,7 @@ def trunc_div(a, d):

safe_neg = lambda x: -x-1 if -x > int_max else -x
# test round divison for signed integers
for x, d in zip(rdata, divisors):
for x, d in itertools.product(rdata, divisors):
d_neg = safe_neg(d)
data = self._data(x)
data_neg = [safe_neg(a) for a in data]
Expand Down
127 changes: 100 additions & 27 deletions numpy/core/tests/test_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
import sys
from fractions import Fraction
from functools import reduce

import numpy.core.umath as ncu
from numpy.core import _umath_tests as ncu_tests
Expand Down Expand Up @@ -249,43 +250,115 @@ def test_division_int(self):
assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])

@pytest.mark.parametrize("input_dtype",
np.sctypes['int'] + np.sctypes['uint'])
def test_division_int_boundary(self, input_dtype):
iinfo = np.iinfo(input_dtype)

# Unsigned:
# Create list with 0, 25th, 50th, 75th percentile and max
if iinfo.min == 0:
lst = [0, iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
divisors = [iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
# Signed:
# Create list with min, 25th percentile, 0, 75th percentile, max
else:
lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
a = np.array(lst, dtype=input_dtype)
@pytest.mark.parametrize("dtype,ex_val", itertools.product(
np.sctypes['int'] + np.sctypes['uint'], (
(
# dividend
"np.arange(fo.max-lsize, fo.max, dtype=dtype),"
# divisors
"np.arange(lsize, dtype=dtype),"
# scalar divisors
"range(15)"
),
(
# dividend
"np.arange(fo.min, fo.min+lsize, dtype=dtype),"
# divisors
"np.arange(lsize//-2, lsize//2, dtype=dtype),"
# scalar divisors
"range(fo.min, fo.min + 15)"
), (
# dividend
"np.arange(fo.max-lsize, fo.max, dtype=dtype),"
# divisors
"np.arange(lsize, dtype=dtype),"
# scalar divisors
"[1,3,9,13,neg, fo.min+1, fo.min//2, fo.max//3, fo.max//4]"
)
)
))
def test_division_int_boundary(self, dtype, ex_val):
fo = np.iinfo(dtype)
neg = -1 if fo.min < 0 else 1
# Large enough to test SIMD loops and remaind elements
lsize = 512 + 7
a, b, divisors = eval(ex_val)
a_lst, b_lst = a.tolist(), b.tolist()

c_div = lambda n, d: (
0 if d == 0 or (n and n == fo.min and d == -1) else n//d
)
with np.errstate(divide='ignore'):
ac = a.copy()
ac //= b
div_ab = a // b
div_lst = [c_div(x, y) for x, y in zip(a_lst, b_lst)]

msg = "Integer arrays floor division check (//)"
assert all(div_ab == div_lst), msg
msg_eq = "Integer arrays floor division check (//=)"
assert all(ac == div_lst), msg_eq

for divisor in divisors:
div_a = a // divisor
b = a.copy(); b //= divisor
div_lst = [i // divisor for i in lst]
ac = a.copy()
with np.errstate(divide='ignore'):
div_a = a // divisor
ac //= divisor
div_lst = [c_div(i, divisor) for i in a_lst]

msg = "Integer arrays floor division check (//)"
assert all(div_a == div_lst), msg

msg = "Integer arrays floor division check (//=)"
assert all(div_a == b), msg
assert all(ac == div_lst), msg_eq

with np.errstate(divide='raise'):
if 0 in b or (fo.min and -1 in b and fo.min in a):
# Verify overflow case
with pytest.raises(FloatingPointError):
a // b
else:
a // b
if fo.min and fo.min in a:
with pytest.raises(FloatingPointError):
a // -1
elif fo.min:
a // -1
with pytest.raises(FloatingPointError):
a // 0
with pytest.raises(FloatingPointError):
a //= 0
ac = a.copy()
ac //= 0

np.array([], dtype=dtype) // 0

@pytest.mark.parametrize("dtype,ex_val", itertools.product(
np.sctypes['int'] + np.sctypes['uint'], (
"np.array([fo.max, 1, 2, 1, 1, 2, 3], dtype=dtype)",
"np.array([fo.min, 1, -2, 1, 1, 2, -3], dtype=dtype)",
"np.arange(fo.min, fo.min+(100*10), 10, dtype=dtype)",
"np.arange(fo.max-(100*7), fo.max, 7, dtype=dtype)",
)
))
def test_division_int_reduce(self, dtype, ex_val):
fo = np.iinfo(dtype)
a = eval(ex_val)
lst = a.tolist()
c_div = lambda n, d: (
0 if d == 0 or (n and n == fo.min and d == -1) else n//d
)

with np.errstate(divide='ignore'):
div_a = np.floor_divide.reduce(a)
div_lst = reduce(c_div, lst)
msg = "Reduce floor integer division check"
assert div_a == div_lst, msg

np.array([], dtype=input_dtype) // 0
with np.errstate(divide='raise'):
with pytest.raises(FloatingPointError):
np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype))
if fo.min:
with pytest.raises(FloatingPointError):
np.floor_divide.reduce(
np.array([fo.min, 1, -1], dtype=dtype)
)

@pytest.mark.parametrize(
"dividend,divisor,quotient",
Expand Down

0 comments on commit 36cc9be

Please sign in to comment.