Skip to content

Commit

Permalink
NEON: Implement some f16XN types and f16 related intrinsics. (#1071)
Browse files Browse the repository at this point in the history
* [NEON] Add vmulq_f16 and vmul_f16.

* [NEON] Add vmulq_f16 and vmul_f16 test.

* [NEON] Add vget_lane_f16 and vgetq_lane_f16.

* [NEON] Add vsubh_f16 and vsub_f16.

* [NEON] Add vextq_f16.

* [NEON] Add vget_low_f16.

* [NEON] Add vmulq_lane_f16.

* [NEON] Add vmul_n_f16.

* [NEON] Add vget_high_f16.

* [NEON] Add vsetq_lane_f16.

* [NEON] Add vcombine_f16.

* [NEON] Add vcvtaq_s32_f32, vcvtas_s32_f32, and vcvta_s32_f32.

* [NEON] Add vpadd_f16.

* [NEON] Add vuzp1_f16.

* [NEON] Add vuzp2_f16.

* [NEON] Add vmaxq_f16 and vmax_f16.

* [NEON] Add vcvtas_u32_f32, vcvta_u32_f32, vcvtaq_u32_f32.

* [NEON] Add type simde_float16x8x2_t.

* [NEON] Add vld2q_f16.

* [NEON] Add vld1q_dup_f16.

* [NEON] Add vpmax_f16.

* [NEON] Add vrsqrtsq_f16, vrsqrtsh_f16, vrsqrts_f16.

* [NEON] Add vcgtq_f16, vcgt_f16, vcgth_f16.

* [NEON] Add vdiv_f32 and vdivq_f32.

* [NEON] Add vrecps_f16 and vrecpsq_f16.

* [NEON] Add vset_lane_f16.

* [NEON] Add vrecpe_f16, vrecpeq_f16.

* [NEON] Add vfmaq_f16.

* [NEON] Add vabsq_f16 and vabs_f16.

* [NEON] Add vcltq_f16 and vclth_f16.

* [NEON] Add vmin_f16 and vminq_f16.

* [NEON] Add vclt_f16.

* [NEON] Add vclezq_f16, vclez_f16, and vclezh_f16.

* [NEON] Add vzip_f16 and vzipq_f16.

* [NEON] Add vzip1_f16 and vzip1q_f16.

* [NEON] Add vzip2_f16 and vzip2q_f16.

* [NEON] Add vst2_f16 and vst2q_f16.

* [NEON] Add type simde_float16x4x2_t.

* [NEON] Add 16 intrinsics of vreinterpret series.

* [SIMDE] Add sqrtl() in simde_math_sqrtl.

* [NEON] Add vrndnq_f16, vrndns_f16, and vrndn_f16.

* [NEON] Add sqrt in meson.build.

* [NEON] Add 7 sqrt related intrinsics.

* [Fix] Add a check for whether sqrt() is already defined.

* [NEON] Add vrsqrteq_f16, vrsqrte_f16, and vrsqrteh_f16.

* [NEON] Add vqrshrnh_n_s16 and vqrshrnh_n_u16.

* [NEON] Add vqrshrunh_n_s16.

* [Fix] Add new conditions for fp16 intrinsics.

* [License] Add Copyright.
  • Loading branch information
yyctw committed Oct 6, 2023
1 parent 1594d7c commit aae2245
Show file tree
Hide file tree
Showing 82 changed files with 6,412 additions and 16 deletions.
2 changes: 2 additions & 0 deletions meson.build
Expand Up @@ -48,6 +48,7 @@ simde_neon_families = [
'cvtn',
'combine',
'create',
'div',
'dot',
'dot_lane',
'dup_n',
Expand Down Expand Up @@ -164,6 +165,7 @@ simde_neon_families = [
'shr_n',
'shrn_n',
'sqadd',
'sqrt',
'sra_n',
'sri_n',
'st1',
Expand Down
3 changes: 3 additions & 0 deletions simde/arm/neon.h
Expand Up @@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_H)
Expand Down Expand Up @@ -68,6 +69,7 @@
#include "neon/cvtn.h"
#include "neon/combine.h"
#include "neon/create.h"
#include "neon/div.h"
#include "neon/dot.h"
#include "neon/dot_lane.h"
#include "neon/dup_lane.h"
Expand Down Expand Up @@ -184,6 +186,7 @@
#include "neon/shr_n.h"
#include "neon/shrn_n.h"
#include "neon/sqadd.h"
#include "neon/sqrt.h"
#include "neon/sra_n.h"
#include "neon/sri_n.h"
#include "neon/st1.h"
Expand Down
63 changes: 63 additions & 0 deletions simde/arm/neon/abs.h
Expand Up @@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABS_H)
Expand All @@ -47,6 +48,45 @@ simde_vabsd_s64(int64_t a) {
#define vabsd_s64(a) simde_vabsd_s64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float16_t
simde_vabsh_f16(simde_float16_t a) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabsh_f16(a);
  #else
    /* Portable path: widen to f32, clear the sign bit, narrow back.
     * fabsf is used instead of a `>= 0.0f` ternary on purpose:
     * vabsh(-0.0) must return +0.0, but `-0.0f >= 0.0f` evaluates to
     * true, so the sign-test ternary would pass -0.0 through unchanged
     * (and would also leave a negative NaN's sign bit set). */
    simde_float32_t a_ = simde_float16_to_float32(a);

    return simde_float16_from_float32(simde_math_fabsf(a_));
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vabsh_f16
  #define vabsh_f16(a) simde_vabsh_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float16x4_t
simde_vabs_f16(simde_float16x4_t a) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabs_f16(a);
  #else
    /* Portable fallback: apply the half-precision |x| lane by lane. */
    simde_float16x4_private a_ = simde_float16x4_to_private(a);
    simde_float16x4_private r_;

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vabsh_f16(a_.values[lane]);
    }

    return simde_float16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vabs_f16
  #define vabs_f16(a) simde_vabs_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabs_f32(simde_float32x2_t a) {
Expand Down Expand Up @@ -211,6 +251,29 @@ simde_vabs_s64(simde_int64x1_t a) {
#define vabs_s64(a) simde_vabs_s64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float16x8_t
simde_vabsq_f16(simde_float16x8_t a) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabsq_f16(a);
  #else
    /* Portable fallback: apply the half-precision |x| lane by lane. */
    simde_float16x8_private a_ = simde_float16x8_to_private(a);
    simde_float16x8_private r_;

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vabsh_f16(a_.values[lane]);
    }

    return simde_float16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vabsq_f16
  #define vabsq_f16(a) simde_vabsq_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabsq_f32(simde_float32x4_t a) {
Expand Down
66 changes: 66 additions & 0 deletions simde/arm/neon/cgt.h
Expand Up @@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2020 Christopher Moore <moore@free.fr>
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_CGT_H)
Expand Down Expand Up @@ -78,6 +79,23 @@ simde_vcgtd_u64(uint64_t a, uint64_t b) {
#define vcgtd_u64(a, b) simde_vcgtd_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
uint16_t
simde_vcgth_f16(simde_float16_t a, simde_float16_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return HEDLEY_STATIC_CAST(uint16_t, vcgth_f16(a, b));
  #else
    /* Compare in single precision; result is an all-ones/all-zeros mask. */
    if (simde_float16_to_float32(a) > simde_float16_to_float32(b)) {
      return UINT16_MAX;
    }
    return 0;
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vcgth_f16
  #define vcgth_f16(a, b) simde_vcgth_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vcgts_f32(simde_float32_t a, simde_float32_t b) {
Expand All @@ -92,6 +110,30 @@ simde_vcgts_f32(simde_float32_t a, simde_float32_t b) {
#define vcgts_f32(a, b) simde_vcgts_f32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vcgtq_f16(a, b);
  #else
    /* Portable fallback: per-lane a > b mask via the scalar helper. */
    simde_uint16x8_private r_;
    simde_float16x8_private a_ = simde_float16x8_to_private(a);
    simde_float16x8_private b_ = simde_float16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vcgth_f16(a_.values[lane], b_.values[lane]);
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcgtq_f16
  #define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) {
Expand Down Expand Up @@ -442,6 +484,30 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vcgtq_u64(a, b) simde_vcgtq_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vcgt_f16(a, b);
  #else
    /* Portable fallback: per-lane a > b mask via the scalar helper. */
    simde_uint16x4_private r_;
    simde_float16x4_private a_ = simde_float16x4_to_private(a);
    simde_float16x4_private b_ = simde_float16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vcgth_f16(a_.values[lane], b_.values[lane]);
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcgt_f16
  #define vcgt_f16(a, b) simde_vcgt_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) {
Expand Down
61 changes: 61 additions & 0 deletions simde/arm/neon/clez.h
Expand Up @@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2020 Christopher Moore <moore@free.fr>
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_CLEZ_H)
Expand Down Expand Up @@ -78,6 +79,44 @@ simde_vclezs_f32(simde_float32_t a) {
#define vclezs_f32(a) simde_vclezs_f32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
uint16_t
simde_vclezh_f16(simde_float16_t a) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return HEDLEY_STATIC_CAST(uint16_t, vclezh_f16(a));
  #else
    /* Compare against zero in single precision; result is a full mask. */
    return (simde_float16_to_float32(a) <= 0.0f) ? UINT16_MAX : 0;
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vclezh_f16
  #define vclezh_f16(a) simde_vclezh_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vclezq_f16(simde_float16x8_t a) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vclezq_f16(a);
  #else
    /* Portable fallback: per-lane a <= 0 mask via the scalar helper. */
    simde_uint16x8_private r_;
    simde_float16x8_private a_ = simde_float16x8_to_private(a);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vclezh_f16(a_.values[lane]);
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vclezq_f16
  #define vclezq_f16(a) simde_vclezq_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vclezq_f32(simde_float32x4_t a) {
Expand Down Expand Up @@ -246,6 +285,28 @@ simde_vclezq_s64(simde_int64x2_t a) {
#define vclezq_s64(a) simde_vclezq_s64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vclez_f16(simde_float16x4_t a) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vclez_f16(a);
  #else
    /* Portable fallback: per-lane a <= 0 mask via the scalar helper. */
    simde_uint16x4_private r_;
    simde_float16x4_private a_ = simde_float16x4_to_private(a);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vclezh_f16(a_.values[lane]);
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vclez_f16
  #define vclez_f16(a) simde_vclez_f16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vclez_f32(simde_float32x2_t a) {
Expand Down
66 changes: 66 additions & 0 deletions simde/arm/neon/clt.h
Expand Up @@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <evan@nemerson.com>
* 2020 Christopher Moore <moore@free.fr>
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_CLT_H)
Expand Down Expand Up @@ -77,6 +78,23 @@ simde_vcltd_u64(uint64_t a, uint64_t b) {
#define vcltd_u64(a, b) simde_vcltd_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
uint16_t
simde_vclth_f16(simde_float16_t a, simde_float16_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return HEDLEY_STATIC_CAST(uint16_t, vclth_f16(a, b));
  #else
    /* Compare in single precision; result is an all-ones/all-zeros mask. */
    if (simde_float16_to_float32(a) < simde_float16_to_float32(b)) {
      return UINT16_MAX;
    }
    return 0;
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vclth_f16
  #define vclth_f16(a, b) simde_vclth_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vclts_f32(simde_float32_t a, simde_float32_t b) {
Expand All @@ -91,6 +109,30 @@ simde_vclts_f32(simde_float32_t a, simde_float32_t b) {
#define vclts_f32(a, b) simde_vclts_f32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vcltq_f16(a, b);
  #else
    /* Portable fallback: per-lane a < b mask via the scalar helper. */
    simde_uint16x8_private r_;
    simde_float16x8_private a_ = simde_float16x8_to_private(a);
    simde_float16x8_private b_ = simde_float16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vclth_f16(a_.values[lane], b_.values[lane]);
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcltq_f16
  #define vcltq_f16(a, b) simde_vcltq_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) {
Expand Down Expand Up @@ -450,6 +492,30 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vcltq_u64(a, b) simde_vcltq_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vclt_f16(a, b);
  #else
    /* Portable fallback: per-lane a < b mask via the scalar helper. */
    simde_uint16x4_private r_;
    simde_float16x4_private a_ = simde_float16x4_to_private(a);
    simde_float16x4_private b_ = simde_float16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t lane = 0 ; lane < (sizeof(r_.values) / sizeof(r_.values[0])) ; lane++) {
      r_.values[lane] = simde_vclth_f16(a_.values[lane], b_.values[lane]);
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vclt_f16
  #define vclt_f16(a, b) simde_vclt_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) {
Expand Down

0 comments on commit aae2245

Please sign in to comment.