From 58bea0bcbba3629043939aa499068055dd0df017 Mon Sep 17 00:00:00 2001 From: divinity76 Date: Tue, 12 Mar 2024 08:21:51 +0100 Subject: [PATCH] optimize neon loadu_128/storeu_128 (#384) vld1q_u8 and vst1q_u8 have no alignment requirements. This improves performance on Oracle Cloud's VM.Standard.A1.Flex by 1.15% on a 16*1024 input, from 13920 nanoseconds down to 13800 nanoseconds (approx) --- c/blake3_neon.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/c/blake3_neon.c b/c/blake3_neon.c index 8a818fc7..90bdd572 100644 --- a/c/blake3_neon.c +++ b/c/blake3_neon.c @@ -10,14 +10,12 @@ INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; + return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. - memcpy(dest, &src, 16); + vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {