diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs
index 4aaff0f4..f126dc61 100644
--- a/sha2/src/consts.rs
+++ b/sha2/src/consts.rs
@@ -163,56 +163,6 @@ pub const K64X2: [[u64; 2]; 40] = [
     [K64[79], K64[78]],
 ];
 
-macro_rules! dup_array {
-    ([$([$a:expr, $b:expr]),*,]) => {[
-        $($b, $a, $b, $a),*,
-    ]}
-}
-
-/// Constants necessary for SHA-512 family of digests.
-pub const K64X4: [u64; 160] = dup_array!([
-    [K64[1], K64[0]],
-    [K64[3], K64[2]],
-    [K64[5], K64[4]],
-    [K64[7], K64[6]],
-    [K64[9], K64[8]],
-    [K64[11], K64[10]],
-    [K64[13], K64[12]],
-    [K64[15], K64[14]],
-    [K64[17], K64[16]],
-    [K64[19], K64[18]],
-    [K64[21], K64[20]],
-    [K64[23], K64[22]],
-    [K64[25], K64[24]],
-    [K64[27], K64[26]],
-    [K64[29], K64[28]],
-    [K64[31], K64[30]],
-    [K64[33], K64[32]],
-    [K64[35], K64[34]],
-    [K64[37], K64[36]],
-    [K64[39], K64[38]],
-    [K64[41], K64[40]],
-    [K64[43], K64[42]],
-    [K64[45], K64[44]],
-    [K64[47], K64[46]],
-    [K64[49], K64[48]],
-    [K64[51], K64[50]],
-    [K64[53], K64[52]],
-    [K64[55], K64[54]],
-    [K64[57], K64[56]],
-    [K64[59], K64[58]],
-    [K64[61], K64[60]],
-    [K64[63], K64[62]],
-    [K64[65], K64[64]],
-    [K64[67], K64[66]],
-    [K64[69], K64[68]],
-    [K64[71], K64[70]],
-    [K64[73], K64[72]],
-    [K64[75], K64[74]],
-    [K64[77], K64[76]],
-    [K64[79], K64[78]],
-]);
-
 pub static H224: [u32; STATE_LEN] = [
     0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
 ];
diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs
index fedf3e66..09b3ceea 100644
--- a/sha2/src/sha512/x86.rs
+++ b/sha2/src/sha512/x86.rs
@@ -9,9 +9,9 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-use crate::consts::{K64, K64X4};
+use crate::consts::K64;
 
-cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3");
+cpufeatures::new!(avx2_cpuid, "avx2");
 
 pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
     // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
@@ -25,7 +25,7 @@ pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
     }
 }
 
-#[target_feature(enable = "avx,avx2,sse2,sse3")]
+#[target_feature(enable = "avx2")]
 unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
     let mut start_block = 0;
 
@@ -110,10 +110,9 @@ unsafe fn load_data_avx2(
             x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _), 0);
             x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
 
-            let y = _mm256_add_epi64(
-                x[$i],
-                _mm256_loadu_si256(&K64X4[4 * $i] as *const u64 as *const _),
-            );
+
+            let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
+            let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
 
             _mm_store_si128(
                 &mut ms[2 * $i] as *mut u64 as *mut _,
@@ -135,7 +134,8 @@ unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &
 
     for _ in 0..4 {
         for j in 0..8 {
-            let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _);
+            let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
+            let y = sha512_update_x_avx(x, k64);
 
             sha_round(current_state, ms[2 * j]);
             sha_round(current_state, ms[2 * j + 1]);
@@ -153,11 +153,12 @@ unsafe fn rounds_0_63_avx2(
     ms: &mut MsgSchedule,
     t2: &mut RoundStates,
 ) {
-    let mut k64x4_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM;
+    let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;
 
     for i in 1..5 {
         for j in 0..8 {
-            let y = sha512_update_x_avx2(x, &K64X4[k64x4_idx] as *const u64 as *const _);
+            let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
+            let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));
 
             sha_round(current_state, ms[2 * j]);
             sha_round(current_state, ms[2 * j + 1]);
@@ -171,7 +172,7 @@ unsafe fn rounds_0_63_avx2(
                 _mm256_extracti128_si256(y, 1),
             );
 
-            k64x4_idx += 4;
+            k64x4_idx += 2;
         }
     }
 }
@@ -249,14 +250,13 @@ unsafe fn accumulate_state(dst: &mut State, src: &State) {
 
 macro_rules! fn_sha512_update_x {
     ($name:ident, $ty:ident, {
-        LOAD = $LOAD:ident,
         ADD64 = $ADD64:ident,
         ALIGNR8 = $ALIGNR8:ident,
         SRL64 = $SRL64:ident,
         SLL64 = $SLL64:ident,
         XOR = $XOR:ident,
     }) => {
-        unsafe fn $name(x: &mut [$ty; 8], k64_p: *const $ty) -> $ty {
+        unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
             // q[2:1]
             let mut t0 = $ALIGNR8(x[1], x[0], 8);
             // q[10:9]
@@ -320,13 +320,12 @@ macro_rules! fn_sha512_update_x {
             x[6] = x[7];
             x[7] = temp;
 
-            $ADD64(x[7], $LOAD(k64_p))
+            $ADD64(x[7], k64)
         }
     };
 }
 
 fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
-    LOAD = _mm_loadu_si128,
     ADD64 = _mm_add_epi64,
     ALIGNR8 = _mm_alignr_epi8,
     SRL64 = _mm_srli_epi64,
@@ -335,7 +334,6 @@ fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
 });
 
 fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
-    LOAD = _mm256_loadu_si256,
     ADD64 = _mm256_add_epi64,
     ALIGNR8 = _mm256_alignr_epi8,
     SRL64 = _mm256_srli_epi64,
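
Note (not part of the patch): the core trick above is replacing loads from the precomputed, lane-duplicated K64X4 table with an on-the-fly broadcast from the plain K64 table. Two u64 round constants are loaded into a __m128i and mirrored into both 128-bit lanes of a __m256i via _mm256_set_m128i(t, t). A minimal sketch of that pattern, assuming x86_64 with AVX2; the function name broadcast_k64_pair is illustrative, not from the patch:

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn broadcast_k64_pair(k64: &[u64; 80], idx: usize) -> core::arch::x86_64::__m256i {
        use core::arch::x86_64::*;
        // Load K64[idx] and K64[idx + 1] into a 128-bit register
        // (unaligned load, matching the patch's _mm_loadu_si128 usage).
        let t = _mm_loadu_si128(k64.as_ptr().add(idx) as *const __m128i);
        // Mirror the 128-bit value into both halves of a 256-bit register,
        // yielding [K64[idx], K64[idx + 1], K64[idx], K64[idx + 1]].
        _mm256_set_m128i(t, t)
    }

This is also why k64x4_idx now starts at SHA512_BLOCK_WORDS_NUM and advances by 2 (indexing u64 pairs in K64) rather than starting at 2 * SHA512_BLOCK_WORDS_NUM and advancing by 4 (indexing the duplicated K64X4), letting the 160-entry K64X4 table and its dup_array! helper be deleted outright.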