From b963eb29066a8f4e5a19f67cc41bfd90e4555aa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2?= <newpavlov@gmail.com>
Date: Thu, 6 Jan 2022 20:17:59 +0300
Subject: [PATCH] sha2: backport #345 to v0.9

---
 Cargo.lock             |  2 +-
 sha2/CHANGELOG.md      |  9 ++++++-
 sha2/Cargo.toml        |  2 +-
 sha2/src/sha512/x86.rs | 61 +++++++++++++++++++++++-------------------
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cba49569..635a5180 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -271,7 +271,7 @@ dependencies = [
 
 [[package]]
 name = "sha2"
-version = "0.9.8"
+version = "0.9.9"
 dependencies = [
  "block-buffer",
  "cfg-if",
diff --git a/sha2/CHANGELOG.md b/sha2/CHANGELOG.md
index 433465d0..b9dd701e 100644
--- a/sha2/CHANGELOG.md
+++ b/sha2/CHANGELOG.md
@@ -5,7 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## 0.9.8 (2021-09-09)
+## 0.9.9 (2022-01-06)
+### Fixed
+- Backport [#345] bug fix for the AVX2 backend ([#346])
+
+[#345]: https://github.com/RustCrypto/hashes/pull/345
+[#346]: https://github.com/RustCrypto/hashes/pull/346
+
+## 0.9.8 (2021-09-09) [YANKED]
 ### Fixed
 - Bug in the AVX2 backend ([#314])
 
diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml
index cf17feb8..32629cee 100644
--- a/sha2/Cargo.toml
+++ b/sha2/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sha2"
-version = "0.9.8"
+version = "0.9.9"
 description = """
 Pure Rust implementation of the SHA-2 hash function family
 including SHA-224, SHA-256, SHA-384, and SHA-512.
diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs
index 5f555c21..bb790408 100644
--- a/sha2/src/sha512/x86.rs
+++ b/sha2/src/sha512/x86.rs
@@ -34,8 +34,8 @@ unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]
         start_block += 1;
     }
 
-    let mut ms: MsgSchedule = Default::default();
-    let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
+    let mut ms: MsgSchedule = [_mm_setzero_si128(); 8];
+    let mut t2: RoundStates = [_mm_setzero_si128(); 40];
     let mut x = [_mm256_setzero_si256(); 8];
 
     for i in (start_block..blocks.len()).step_by(2) {
@@ -56,7 +56,7 @@ unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]
 
 #[inline(always)]
 unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
-    let mut ms = Default::default();
+    let mut ms = [_mm_setzero_si128(); 8];
     let mut x = [_mm_setzero_si128(); 8];
 
     // Reduced to single iteration
@@ -82,7 +82,7 @@ unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const
                 _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
             );
 
-            _mm_store_si128(&mut ms[2 * $i] as *mut u64 as *mut _, y);
+            ms[$i] = y;
         )*};
     }
 
@@ -114,14 +114,8 @@ unsafe fn load_data_avx2(
             let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
             let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
 
-            _mm_store_si128(
-                &mut ms[2 * $i] as *mut u64 as *mut _,
-                _mm256_extracti128_si256(y, 0),
-            );
-            _mm_store_si128(
-                &mut t2[2 * $i] as *mut u64 as *mut _,
-                _mm256_extracti128_si256(y, 1),
-            );
+            ms[$i] = _mm256_extracti128_si256(y, 0);
+            t2[$i] = _mm256_extracti128_si256(y, 1);
         )*};
     }
 
@@ -137,10 +131,13 @@ unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &
             let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
             let y = sha512_update_x_avx(x, k64);
 
-            sha_round(current_state, ms[2 * j]);
-            sha_round(current_state, ms[2 * j + 1]);
+            {
+                let ms = cast_ms(ms);
+                sha_round(current_state, ms[2 * j]);
+                sha_round(current_state, ms[2 * j + 1]);
+            }
 
-            _mm_store_si128(&mut ms[2 * j] as *const u64 as *mut _, y);
+            ms[j] = y;
             k64_idx += 2;
         }
     }
@@ -160,17 +157,14 @@ unsafe fn rounds_0_63_avx2(
             let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
             let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));
 
-            sha_round(current_state, ms[2 * j]);
-            sha_round(current_state, ms[2 * j + 1]);
+            {
+                let ms = cast_ms(ms);
+                sha_round(current_state, ms[2 * j]);
+                sha_round(current_state, ms[2 * j + 1]);
+            }
 
-            _mm_store_si128(
-                &mut ms[2 * j] as *mut u64 as *mut _,
-                _mm256_extracti128_si256(y, 0),
-            );
-            _mm_store_si128(
-                &mut t2[(16 * i) + 2 * j] as *mut u64 as *mut _,
-                _mm256_extracti128_si256(y, 1),
-            );
+            ms[j] = _mm256_extracti128_si256(y, 0);
+            t2[8 * i + j] = _mm256_extracti128_si256(y, 1);
 
             k64x4_idx += 2;
         }
@@ -179,6 +173,7 @@ unsafe fn rounds_0_63_avx2(
 
 #[inline(always)]
 fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
+    let ms = cast_ms(ms);
     for i in 64..80 {
         sha_round(current_state, ms[i & 0xf]);
     }
@@ -186,7 +181,7 @@ fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
 
 #[inline(always)]
 fn process_second_block(current_state: &mut State, t2: &RoundStates) {
-    for t2 in t2.iter() {
+    for t2 in cast_rs(t2).iter() {
         sha_round(current_state, *t2);
     }
 }
@@ -341,9 +336,19 @@ fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
         XOR = _mm256_xor_si256,
 });
 
+#[inline(always)]
+fn cast_ms(ms: &MsgSchedule) -> &[u64; SHA512_BLOCK_WORDS_NUM] {
+    unsafe { &*(ms as *const MsgSchedule as *const _) }
+}
+
+#[inline(always)]
+fn cast_rs(rs: &RoundStates) -> &[u64; SHA512_ROUNDS_NUM] {
+    unsafe { &*(rs as *const RoundStates as *const _) }
+}
+
 type State = [u64; SHA512_HASH_WORDS_NUM];
-type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
-type RoundStates = [u64; SHA512_ROUNDS_NUM];
+type MsgSchedule = [__m128i; SHA512_BLOCK_WORDS_NUM / 2];
+type RoundStates = [__m128i; SHA512_ROUNDS_NUM / 2];
 
 const SHA512_BLOCK_BYTE_LEN: usize = 128;
 const SHA512_ROUNDS_NUM: usize = 80;