Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Optimize _mm_aeskeygenassist_si128 on ARM64 #573

Merged
1 commit merged on Dec 28, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 5 additions & 8 deletions sse2neon.h
Expand Up @@ -9821,14 +9821,11 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
vreinterpretq_u32_u8(v));
uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));
uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));

return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));

#else /* ARMv7-A NEON implementation */
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
Expand Down