From ded63168de4dce7e4e92753bd39d60d20bfb683e Mon Sep 17 00:00:00 2001 From: Ben Kimock Date: Sun, 19 Jun 2022 04:08:56 -0400 Subject: [PATCH] Fix misaligned reference and logic error in crc32 (#1906) Previously, this code tried to turn a &[u8] into a &[u32] without checking alignment. This means it could and did create misaligned references, which is UB. This can be detected by running the tests with -Zbuild-std --target=x86_64-unknown-linux-gnu (or whatever your host is). This change adopts the approach from the murmurhash implementation. The previous implementation also ignored the tail bytes. The loop at the end treats num_bytes as if it is the full length of the slice, but it isn't, num_bytes number of bytes after the last 4-byte group. This can be observed for example by changing "hello" to just "hell" in the tests. Under the old implementation, the test will still pass. Now, the value that comes out changes, and "hello" and "hell" hash to different values. --- parquet/src/util/hash_util.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/parquet/src/util/hash_util.rs b/parquet/src/util/hash_util.rs index f3705bc32f5..dd23e7a65f4 100644 --- a/parquet/src/util/hash_util.rs +++ b/parquet/src/util/hash_util.rs @@ -107,27 +107,18 @@ unsafe fn crc32_hash(bytes: &[u8], seed: u32) -> u32 { #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; - let u32_num_bytes = std::mem::size_of::(); - let mut num_bytes = bytes.len(); - let num_words = num_bytes / u32_num_bytes; - num_bytes %= u32_num_bytes; - - let bytes_u32: &[u32] = std::slice::from_raw_parts( - &bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32, - num_words, - ); - - let mut offset = 0; let mut hash = seed; - while offset < num_words { - hash = _mm_crc32_u32(hash, bytes_u32[offset]); - offset += 1; + for chunk in bytes + .chunks_exact(4) + .map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap())) + { + hash = _mm_crc32_u32(hash, chunk); } - offset = num_words * u32_num_bytes; - while offset < num_bytes { - hash = _mm_crc32_u8(hash, bytes[offset]); - offset += 1; + let remainder = bytes.len() % 4; + + for byte in &bytes[bytes.len() - remainder..] { + hash = _mm_crc32_u8(hash, *byte); } // The lower half of the CRC hash has poor uniformity, so swap the halves @@ -158,13 +149,13 @@ mod tests { if is_x86_feature_detected!("sse4.2") { unsafe { let result = crc32_hash(b"hello", 123); - assert_eq!(result, 2927487359); + assert_eq!(result, 3359043980); let result = crc32_hash(b"helloworld", 123); - assert_eq!(result, 314229527); + assert_eq!(result, 3971745255); let result = crc32_hash(b"helloworldparquet", 123); - assert_eq!(result, 667078870); + assert_eq!(result, 1124504676); } } }