Skip to content

Commit

Permalink
optimize neon loadu_128/storeu_128
Browse files Browse the repository at this point in the history
vld1q_u8 and vst1q_u8 has no alignment requirements.

This improves performance on Oracle Cloud's VM.Standard.A1.Flex by 1.15% on a 16*1024 input,
 from 13920 nanoseconds down to 13800 nanoseconds (approx)

ref BLAKE3-team/BLAKE3#384
  • Loading branch information
divinity76 committed Feb 9, 2024
1 parent ad9c31e commit 09a34ef
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 6 deletions.
24 changes: 24 additions & 0 deletions ext/hash/blake3/patches.diff
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,27 @@ index af6c3dadc7..af3bf17bbe 100644

void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
diff --git a/ext/hash/blake3/upstream_blake3/c/blake3_neon.c b/ext/hash/blake3/upstream_blake3/c/blake3_neon.c
index 8a818fc78f..c4b4548edf 100644
--- a/ext/hash/blake3/upstream_blake3/c/blake3_neon.c
+++ b/ext/hash/blake3/upstream_blake3/c/blake3_neon.c
@@ -9,15 +9,13 @@
#endif

INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
- // vld1q_u32 has alignment requirements. Don't use it.
- uint32x4_t x;
- memcpy(&x, src, 16);
- return x;
+ // https://github.com/BLAKE3-team/BLAKE3/pull/384
+ return vreinterpretq_u32_u8(vld1q_u8(src));
}

INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
- // vst1q_u32 has alignment requirements. Don't use it.
- memcpy(dest, &src, 16);
+ // https://github.com/BLAKE3-team/BLAKE3/pull/384
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
}

INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
10 changes: 4 additions & 6 deletions ext/hash/blake3/upstream_blake3/c/blake3_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
#endif

INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
uint32x4_t x;
memcpy(&x, src, 16);
return x;
// https://github.com/BLAKE3-team/BLAKE3/pull/384
return vreinterpretq_u32_u8(vld1q_u8(src));
}

INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
memcpy(dest, &src, 16);
// https://github.com/BLAKE3-team/BLAKE3/pull/384
vst1q_u8(dest, vreinterpretq_u8_u32(src));
}

INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
Expand Down

0 comments on commit 09a34ef

Please sign in to comment.