Skip to content

Commit

Permalink
refactor: Update aeskeygenassist_si128 to optimized ver
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Dec 27, 2022
1 parent 72daa0f commit 1a3e100
Showing 1 changed file with 73 additions and 12 deletions.
85 changes: 73 additions & 12 deletions sse2neon.h
Expand Up @@ -9906,19 +9906,80 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
// AESE does ShiftRows and SubBytes on A
uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

uint8x16_t dest = {
// Undo ShiftRows step from AESE and extract X1 and X3
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
};
uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
#define SSE2NEON_AES_SBOX(w) \
{ \
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
w(0xb0), w(0x54), w(0xbb), w(0x16) \
}
/* clang-format on */

#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] =
SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
#if defined(__aarch64__)
uint8x16_t _a = vreinterpretq_u8_m128i(a);
uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
vreinterpretq_u32_u8(v));
uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));

return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));

#else /* ARMv7-A NEON implementation */
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
for (int i = 0; i < 4; ++i) {
((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
}
return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#endif

/* Others */
Expand Down

0 comments on commit 1a3e100

Please sign in to comment.