Skip to content

Commit

Permalink
refactor: Update aeskeygenassist_si128 to optimized version
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Dec 28, 2022
1 parent 72daa0f commit dc53c0d
Showing 1 changed file with 28 additions and 17 deletions.
45 changes: 28 additions & 17 deletions sse2neon.h
Expand Up @@ -9399,8 +9399,6 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
}

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
{ \
Expand Down Expand Up @@ -9483,13 +9481,13 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
w(0x55), w(0x21), w(0x0c), w(0x7d) \
}
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
/* Identity expansion: each S-box entry w(0xNN) becomes the literal 0xNN. */
#define SSE2NEON_AES_H0(x) (x)
/* AES forward (SubBytes) and inverse (InvSubBytes) lookup tables, expanded
 * from the SSE2NEON_AES_SBOX / SSE2NEON_AES_RSBOX X macros above. */
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

#if !defined(__ARM_FEATURE_CRYPTO)
/* x_time function and matrix multiply function */
#if !defined(__aarch64__)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
Expand Down Expand Up @@ -9841,9 +9839,6 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
Expand Down Expand Up @@ -9906,20 +9901,36 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for an encryption cipher using data from a and an 8-bit round
// constant rcon. dst = {SubWord(X1), RotWord(SubWord(X1)) ^ rcon,
//                       SubWord(X3), RotWord(SubWord(X3)) ^ rcon}
// where X1 and X3 are 32-bit words 1 and 3 of a.
// NOTE(review): the diff rendering left the pre-commit AESE-based lines
// interleaved with the new table-lookup implementation; this body keeps only
// the optimized version the commit introduces.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    // SubBytes: run every byte of 'a' through the AES S-box. Each
    // vqtbl4q/vqtbx4q step indexes a 64-entry slice of the 256-entry table;
    // out-of-range indices leave the previous result intact (tbx semantics).
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    // Shift each 64-bit half right by 32 so the substituted odd words land in
    // the even lanes, then blend: x = {S(X1), S(X1), S(X3), S(X3)}.
    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
                             vreinterpretq_u32_u8(v));
    // RotWord (rotate right by 8 bits) and XOR with the round constant.
    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

    // Keep S(X*) in the even lanes, RotWord(S(X*)) ^ rcon in the odd lanes.
    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
    // Extract words 1 and 3 of 'a', then apply SubBytes one byte at a time
    // (byte access through uint8_t* is a legal aliasing of the uint32_t).
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#endif
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

/* Others */

Expand Down

0 comments on commit dc53c0d

Please sign in to comment.