diff --git a/sse2neon.h b/sse2neon.h
index 0db48053..ebd5c286 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -9399,8 +9399,6 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 }
 
 /* AES */
-
-#if !defined(__ARM_FEATURE_CRYPTO)
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w) \
     { \
@@ -9483,13 +9481,13 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
         w(0x55), w(0x21), w(0x0c), w(0x7d) \
     }
 /* clang-format on */
-
 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
 #define SSE2NEON_AES_H0(x) (x)
 static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
 static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
+#if !defined(__ARM_FEATURE_CRYPTO)
 /* x_time function and matrix multiply function */
 #if !defined(__aarch64__)
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
@@ -9841,9 +9839,6 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
 #endif
 }
-#undef SSE2NEON_AES_SBOX
-#undef SSE2NEON_AES_RSBOX
-
 #if defined(__aarch64__)
 #undef SSE2NEON_XT
 #undef SSE2NEON_MULTIPLY
@@ -9906,20 +9901,36 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 {
-    // AESE does ShiftRows and SubBytes on A
-    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
 
-    uint8x16_t dest = {
-        // Undo ShiftRows step from AESE and extract X1 and X3
-        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
-        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
-        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
-        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
-    };
-    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
-    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
+    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
+    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
+                             vreinterpretq_u32_u8(v));
+    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
+    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
 }
 #endif
 
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
 
 /* Others */
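
Note on the new AArch64 path: it performs a full 256-entry S-box lookup with one TBL plus three TBX steps, relying on TBL writing 0 for out-of-range indices and TBX leaving the accumulator untouched. Below is a minimal standalone sketch of the same technique, not part of the patch; the helper name lookup_256 is hypothetical, and it assumes <arm_neon.h> and a toolchain that provides vld1q_u8_x4() (sse2neon wraps it as _sse2neon_vld1q_u8_x4 for compilers that lack the intrinsic).

    #include <arm_neon.h>
    #include <stdint.h>

    /* Hypothetical helper (illustration only): look up 16 bytes in a
     * 256-entry table using four 64-byte TBL/TBX slices. */
    static inline uint8x16_t lookup_256(const uint8_t table[256], uint8x16_t idx)
    {
        /* Indices 0x00-0x3f: TBL returns table[idx]; all others become 0. */
        uint8x16_t v = vqtbl4q_u8(vld1q_u8_x4(table), idx);
        /* Each TBX step covers the next 64-byte slice; indices that fall
         * out of range after the wrapping subtraction keep the value
         * already accumulated in v. */
        v = vqtbx4q_u8(v, vld1q_u8_x4(table + 0x40),
                       vsubq_u8(idx, vdupq_n_u8(0x40)));
        v = vqtbx4q_u8(v, vld1q_u8_x4(table + 0x80),
                       vsubq_u8(idx, vdupq_n_u8(0x80)));
        v = vqtbx4q_u8(v, vld1q_u8_x4(table + 0xc0),
                       vsubq_u8(idx, vdupq_n_u8(0xc0)));
        return v;
    }

The patch itself subtracts the slice offset with GCC/Clang vector-extension arithmetic (_a - 0x40); the sketch spells the same operation out with vsubq_u8.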