Skip to content

Commit

Permalink
rewrite loop rint
Browse files Browse the repository at this point in the history
  • Loading branch information
luyahan committed Apr 25, 2024
1 parent 6f6be04 commit 46bd8a2
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 8 deletions.
2 changes: 1 addition & 1 deletion numpy/_core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ def english_upper(s):
docstrings.get('numpy._core.umath.rint'),
None,
TD('e', f='rint', astype={'e': 'f'}),
TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD(inexactvec, cfunc_alias='rint'),
TD('fdg' + cmplx, f='rint'),
TD(P, f='rint'),
),
Expand Down
6 changes: 5 additions & 1 deletion numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ if use_highway
highway_lib = static_library('highway',
[
# required for hwy::Abort symbol
'src/highway/hwy/abort.cc'
'src/highway/hwy/abort.cc',
'src/highway/hwy/per_target.cc',
'src/highway/hwy/targets.cc'
],
cpp_args: '-DTOOLCHAIN_MISS_ASM_HWCAP_H',
include_directories: ['src/highway'],
Expand Down Expand Up @@ -1141,6 +1143,7 @@ src_umath = umath_gen_headers + [
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
'src/umath/ufunc_type_resolution.c',
'src/umath/loop_unary_fp.cpp',
'src/umath/clip.cpp',
'src/umath/clip.h',
'src/umath/dispatching.c',
Expand Down Expand Up @@ -1214,6 +1217,7 @@ py.extension_module('_multiarray_umath',
'src/multiarray',
'src/npymath',
'src/umath',
'src/highway',
],
dependencies: [blas_dep],
link_with: [npymath_lib, multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')] + highway_lib,
Expand Down
144 changes: 144 additions & 0 deletions numpy/_core/src/umath/loop_unary_fp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include "numpy/ndarraytypes.h"
#include "numpy/npy_common.h"
#include "numpy/npy_math.h"
#include "numpy/utils.h"

#include "fast_loop_macros.h"
#include "loops_utils.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "loop_unary_fp.cpp" // this file
#include <hwy/foreach_target.h> // must come before highway.h
#include <hwy/highway.h>
#include <hwy/aligned_allocator.h>


namespace numpy {
namespace HWY_NAMESPACE { // required: unique per target

// Can skip hn:: prefixes if already inside hwy::HWY_NAMESPACE.
namespace hn = hwy::HWY_NAMESPACE;

// Alternative to per-function HWY_ATTR: see HWY_BEFORE_NAMESPACE
// SUPER(NAME, FUNC) defines
//
//   template <typename T>
//   void Super##NAME(char** args, npy_intp const* dimensions,
//                    npy_intp const* steps);
//
// a unary element-wise loop that applies the Highway vector op FUNC to
// dimensions[0] elements of type T.
//
//   args[0] / steps[0]  - input base pointer and its stride in BYTES
//   args[1] / steps[1]  - output base pointer and its stride in BYTES
//
// Three code paths are used:
//   1. contiguous    - both strides equal sizeof(T): unaligned vector
//                      loads/stores, unrolled 4x, then single vectors, then a
//                      masked tail.
//   2. strided       - strides are whole multiples of sizeof(T) and every
//                      per-lane index fits the signed index type:
//                      gather/scatter.
//   3. element-wise  - everything else (operands reported as overlapping by
//                      is_mem_overlap, or strides that cannot be expressed as
//                      lane indices): one element at a time, in order,
//                      addressed through the byte strides.
//
// Only /* */ comments may be used inside the macro body: a // comment would
// swallow the line-continuation backslash.
#define SUPER(NAME, FUNC)                                                      \
    template <typename T>                                                      \
    HWY_ATTR void Super##NAME(char** args, npy_intp const* dimensions,         \
                              npy_intp const* steps) {                         \
        using TI = hwy::MakeSigned<T>;                                         \
        const hn::ScalableTag<T> d;                                            \
        const hn::Rebind<TI, hn::ScalableTag<T>> di;                           \
        /* Not HWY_RESTRICT: ufuncs may be called in place (out == in). */     \
        const T* input_array = (const T*)args[0];                              \
        T* output_array = (T*)args[1];                                         \
        const size_t size = dimensions[0];                                     \
        const size_t lanes = hn::Lanes(d);                                     \
        /* Elements covered by whole vectors, and the leftover tail. */        \
        const size_t remainder = size % lanes;                                 \
        const size_t full = size - remainder;                                  \
        const bool overlap = is_mem_overlap(input_array, steps[0],             \
                                            output_array, steps[1],            \
                                            dimensions[0]);                    \
                                                                               \
        /* IS_UNARY_CONT expects element TYPES (it compares steps[] with   */  \
        /* sizeof of its arguments); passing the pointers would compare    */  \
        /* the strides against the pointer size instead.                   */  \
        if (!overlap && IS_UNARY_CONT(T, T)) {                                 \
            size_t i = 0;                                                      \
            /* 4x unrolled main loop; `i` carries over into the loops     */   \
            /* below so every element is processed exactly once.          */   \
            for (; i + lanes * 4 <= full; i += lanes * 4) {                    \
                const auto in0 = hn::LoadU(d, input_array + i);                \
                const auto in1 = hn::LoadU(d, input_array + i + lanes * 1);    \
                const auto in2 = hn::LoadU(d, input_array + i + lanes * 2);    \
                const auto in3 = hn::LoadU(d, input_array + i + lanes * 3);    \
                const auto x0 = FUNC(in0);                                     \
                const auto x1 = FUNC(in1);                                     \
                const auto x2 = FUNC(in2);                                     \
                const auto x3 = FUNC(in3);                                     \
                hn::StoreU(x0, d, output_array + i);                           \
                hn::StoreU(x1, d, output_array + i + lanes * 1);               \
                hn::StoreU(x2, d, output_array + i + lanes * 2);               \
                hn::StoreU(x3, d, output_array + i + lanes * 3);               \
            }                                                                  \
            /* Up to three remaining whole vectors. */                         \
            for (; i < full; i += lanes) {                                     \
                const auto in = hn::LoadU(d, input_array + i);                 \
                hn::StoreU(FUNC(in), d, output_array + i);                     \
            }                                                                  \
            /* Partial final vector. */                                        \
            if (remainder) {                                                   \
                const auto in = hn::LoadN(d, input_array + full, remainder);   \
                hn::StoreN(FUNC(in), d, output_array + full, remainder);       \
            }                                                                  \
            return;                                                            \
        }                                                                      \
                                                                               \
        /* Strides in elements; only meaningful when the byte strides are  */  \
        /* exact multiples of the element size.                            */  \
        const npy_intp lsize = sizeof(T);                                      \
        const npy_intp ssrc = steps[0] / lsize;                                \
        const npy_intp sdst = steps[1] / lsize;                                \
        /* Largest per-lane index is stride * (lanes - 1); it must survive */  \
        /* the narrowing to TI (32-bit for float) used by gather/scatter.  */  \
        const npy_intp last_lane = (npy_intp)(lanes - 1);                      \
        const bool indexable =                                                 \
                steps[0] % lsize == 0 && steps[1] % lsize == 0 &&              \
                (npy_intp)(TI)(ssrc * last_lane) == ssrc * last_lane &&        \
                (npy_intp)(TI)(sdst * last_lane) == sdst * last_lane;          \
                                                                               \
        if (!overlap && indexable) {                                           \
            const auto lane_id = hn::Iota(di, 0);                              \
            const auto load_index = hn::Mul(lane_id, hn::Set(di, (TI)ssrc));   \
            const auto store_index = hn::Mul(lane_id, hn::Set(di, (TI)sdst));  \
            /* Signed offsets so that negative strides step backwards. */      \
            for (size_t i = 0; i < full; i += lanes) {                         \
                const auto in = hn::GatherIndex(                               \
                        d, input_array + (npy_intp)i * ssrc, load_index);      \
                hn::ScatterIndex(FUNC(in), d,                                  \
                                 output_array + (npy_intp)i * sdst,            \
                                 store_index);                                 \
            }                                                                  \
            if (remainder) {                                                   \
                const auto in = hn::GatherIndexN(                              \
                        d, input_array + (npy_intp)full * ssrc, load_index,    \
                        remainder);                                            \
                hn::ScatterIndexN(FUNC(in), d,                                 \
                                  output_array + (npy_intp)full * sdst,        \
                                  store_index, remainder);                     \
            }                                                                  \
            return;                                                            \
        }                                                                      \
                                                                               \
        /* Element-wise fallback: walk both operands by their byte strides */  \
        /* so each result is stored before the next input is read.         */  \
        const char* src = args[0];                                             \
        char* dst = args[1];                                                   \
        for (size_t i = 0; i < size;                                           \
             i++, src += steps[0], dst += steps[1]) {                          \
            const auto in = hn::LoadN(d, (const T*)src, 1);                    \
            hn::StoreN(FUNC(in), d, (T*)dst, 1);                               \
        }                                                                      \
    }

// Generates the template SuperRint<T>(args, dimensions, steps), which applies
// hn::Round to every element.
// NOTE(review): rint semantics require round-half-to-even; Highway documents
// Round as rounding to nearest with ties to even - confirm for all targets.
SUPER(Rint, hn::Round)

// Per-target rint loop for double precision: forwards the raw ufunc loop
// arguments to SuperRint instantiated for npy_double.
HWY_ATTR void DOUBLE_HWRint(char **args, npy_intp const *dimensions, npy_intp const *steps) {
    using Element = npy_double;
    SuperRint<Element>(args, dimensions, steps);
}

// Per-target rint loop for single precision: forwards the raw ufunc loop
// arguments to SuperRint instantiated for npy_float.
HWY_ATTR void FLOAT_HWRint(char **args, npy_intp const *dimensions, npy_intp const *steps) {
    using Element = npy_float;
    SuperRint<Element>(args, dimensions, steps);
}

}
}

#if HWY_ONCE
namespace numpy {

// Export the per-target FLOAT_HWRint / DOUBLE_HWRint functions so that the
// HWY_DYNAMIC_POINTER lookups in the extern "C" wrappers below can refer to
// them. Emitted once (inside HWY_ONCE), not once per compiled target.
HWY_EXPORT(FLOAT_HWRint);
HWY_EXPORT(DOUBLE_HWRint);

extern "C" {

/*
 * C-linkage loop for rint on doubles.
 *
 * Looks up the DOUBLE_HWRint implementation through Highway's dynamic
 * dispatch and forwards args/dimensions/steps to it unchanged. The trailing
 * `func` argument is unused.
 */
NPY_NO_EXPORT void
DOUBLE_rint(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const auto rint_impl = HWY_DYNAMIC_POINTER(DOUBLE_HWRint);
    rint_impl(args, dimensions, steps);
}

/*
 * C-linkage loop for rint on floats.
 *
 * Looks up the FLOAT_HWRint implementation through Highway's dynamic
 * dispatch and forwards args/dimensions/steps to it unchanged. The trailing
 * `func` argument is unused.
 */
NPY_NO_EXPORT void
FLOAT_rint(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    const auto rint_impl = HWY_DYNAMIC_POINTER(FLOAT_HWRint);
    rint_impl(args, dimensions, steps);
}

} // extern "C"
} // numpy
#endif

12 changes: 6 additions & 6 deletions numpy/_core/src/umath/loops_unary_fp.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ NPY_FINLINE double c_square_f64(double a)
*/
#if @VCHK@
/**begin repeat1
* #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
* #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
* #repl_0w1 = 0*7, 1#
* #kind = floor, ceil, trunc, sqrt, absolute, square, reciprocal#
* #intr = floor, ceil, trunc, sqrt, abs, square, recip#
* #repl_0w1 = 0*6, 1#
*/
/**begin repeat2
* #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
Expand Down Expand Up @@ -199,9 +199,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
* #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
*/
/**begin repeat1
* #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
* #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
* #clear = 0, 0, 0, 0, 0, 1, 0, 0#
* #kind = floor, ceil, trunc, sqrt, absolute, square, reciprocal#
* #intr = floor, ceil, trunc, sqrt, abs, square, recip#
* #clear = 0, 0, 0, 0, 1, 0, 0#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
Expand Down

0 comments on commit 46bd8a2

Please sign in to comment.