From aa84a7ce948212438d170dafb3bda6a95b349a0d Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Mon, 24 Oct 2022 21:36:58 -0700 Subject: [PATCH 1/3] Improve documentation Signed-off-by: Tom Kaitchuck --- README.md | 20 ++++---- src/lib.rs | 67 +++++++++++++++++++-------- src/random_state.rs | 110 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 160 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index be365a7..0dd4465 100644 --- a/README.md +++ b/README.md @@ -53,18 +53,20 @@ map.insert(56, 78); The aHash package has the following flags: * `std`: This enables features which require the standard library. (On by default) This includes providing the utility classes `AHashMap` and `AHashSet`. * `serde`: Enables `serde` support for the utility classes `AHashMap` and `AHashSet`. -* `compile-time-rng`: Whenever possible aHash will seed hashers with random numbers using the [getrandom](https://github.com/rust-random/getrandom) crate. -This is possible for OS targets which provide a source of randomness. (see the [full list](https://docs.rs/getrandom/0.2.0/getrandom/#supported-targets).) -For OS targets without access to a random number generator, `compile-time-rng` provides an alternative. +* `runtime-rng`: To obtain a seed, Hashers will obtain randomness from the operating system. (On by default) +This is done using the [getrandom](https://github.com/rust-random/getrandom) crate. +* `compile-time-rng`: For OS targets without access to a random number generator, `compile-time-rng` provides an alternative. If `getrandom` is unavailable and `compile-time-rng` is enabled, aHash will generate random numbers at compile time and embed them in the binary. This allows for DOS resistance even if there is no random number generator available at runtime (assuming the compiled binary is not public). -This makes the binary non-deterministic, unless `getrandom` is available for the target in which case the flag does nothing. 
-(If non-determinism is a problem see [constrandom's documentation](https://github.com/tkaitchuck/constrandom#deterministic-builds)) +This makes the binary non-deterministic. (If non-determinism is a problem see [constrandom's documentation](https://github.com/tkaitchuck/constrandom#deterministic-builds)) -**NOTE:** If `getrandom` is unavailable and `compile-time-rng` is disabled aHash will fall back on using the numeric -value of memory addresses as a source of randomness. This is somewhat strong if ALSR is turned on (it is by default) -but for embedded platforms this will result in weak keys. As a result, it is recommended to use `compile-time-rng` anytime -random numbers will not be available at runtime. +If both `runtime-rng` and `compile-time-rng` are enabled the `runtime-rng` will take precedence and `compile-time-rng` will do nothing. + +**NOTE:** If both `runtime-rng` and `compile-time-rng` are disabled, a source of randomness may be provided by the application on startup +using the [ahash::random_state::set_random_source](https://docs.rs/ahash/latest/ahash/random_state/fn.set_random_source.html) method. +If neither flag is set and this is not done, aHash will fall back on using the numeric value of memory addresses as a source of randomness. +This is somewhat strong if ASLR is turned on (it is by default) but for embedded platforms this will result in weak keys. +As a result, it is recommended to use `compile-time-rng` anytime random numbers will not be available at runtime. ## Comparison with other hashers diff --git a/src/lib.rs b/src/lib.rs index 4566af0..743bae5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,24 @@ -//! AHash is a hashing algorithm is intended to be a high performance, (hardware specific), keyed hash function. -//! This can be seen as a DOS resistant alternative to `FxHash`, or a fast equivalent to `SipHash`. -//! It provides a high speed hash algorithm, but where the result is not predictable without knowing a Key. -//! 
This allows it to be used in a `HashMap` without allowing for the possibility that an malicious user can +//! AHash is a high performance keyed hash function. +//! +//! It is a DOS resistant alternative to `FxHash` or a faster alternative to `SipHash`. +//! +//! It quickly provides a high quality hash where the result is not predictable without knowing the Key. +//! AHash works with `HashMap` to hash keys, but without allowing for the possibility that an malicious user can //! induce a collision. //! //! # How aHash works //! -//! aHash uses the hardware AES instruction on x86 processors to provide a keyed hash function. -//! aHash is not a cryptographically secure hash. +//! When it is available aHash uses the hardware AES instructions to provide a keyed hash function. +//! When it is not, aHash falls back on a slightly slower alternative algorithm. //! +//! AHash does not have a fixed standard for its output. This allows it to improve over time. +//! But this also means that different computers or computers using different versions of ahash will observe different +//! hash values. #![cfg_attr( any(feature = "compile-time-rng", feature = "runtime-rng"), doc = r##" -# Example +# Usage +AHash is a drop in replacement for the default implementation of the Hasher trait. To construct a HashMap using aHash as its hasher do the following: ``` use ahash::{AHasher, RandomState}; use std::collections::HashMap; @@ -25,25 +31,46 @@ map.insert(12, 34); #![cfg_attr( feature = "std", doc = r##" -For convenience, both new-type wrappers and type aliases are provided. The new type wrappers are called called `AHashMap` and `AHashSet`. These do the same thing with slightly less typing. -The type aliases are called `ahash::HashMap`, `ahash::HashSet` are also provided and alias the -std::[HashMap] and std::[HashSet]. Why are there two options? The wrappers are convenient but -can't be used where a generic `std::collection::HashMap` is required. 
+For convenience, both new-type wrappers and type aliases are provided. +The new type wrappers are called `AHashMap` and `AHashSet`. +These do the same thing with slightly less typing. (For convenience `From`, `Into`, and `Deref` are provided). ``` use ahash::AHashMap; -let mut map: AHashMap<i32, i32> = AHashMap::with_capacity(4); +let mut map: AHashMap<i32, i32> = AHashMap::new(); map.insert(12, 34); -map.insert(56, 78); -// There are also type aliases provieded together with some extension traits to make -// it more of a drop in replacement for the std::HashMap/HashSet -use ahash::{HashMapExt, HashSetExt}; // Used to get with_capacity() -let mut map = ahash::HashMap::with_capacity(10); +``` + +For even less typing and better interop with existing libraries which require a `std::collections::HashMap` (such as rayon), +the type aliases [HashMap], [HashSet] are provided. These alias the `std::HashMap` and `std::HashSet` using aHash as the hasher. + +``` +use ahash::{HashMap, HashMapExt}; + +let mut map: HashMap<i32, i32> = HashMap::new(); map.insert(12, 34); -let mut set = ahash::HashSet::with_capacity(10); -set.insert(10); ``` +Note the import of [HashMapExt]. This is needed for the constructor. + +# Directly hashing + +Hashers can also be instantiated with `RandomState`. For example: +``` +use std::hash::BuildHasher; +use ahash::RandomState; + +let hash_builder = RandomState::with_seed(42); +let hash = hash_builder.hash_one("Some Data"); +``` +### Randomness + +To ensure that each map has a unique set of keys aHash needs a source of randomness. +Normally this is just obtained from the OS. (Or via the `compile-time-rng` flag) + +If for some reason (such as fuzzing) an application wishes to supply all random seeds manually, this can be done via: +[random_state::set_random_source]. + "## )] #![deny(clippy::correctness, clippy::complexity, clippy::perf)] @@ -157,7 +184,7 @@ where /// [AHasher]s in order to hash the keys of the map. 
/// /// Generally it is preferable to use [RandomState] instead, so that different -/// hashmaps will have different keys. However if fixed keys are desireable this +/// hashmaps will have different keys. However if fixed keys are desirable this /// may be used instead. /// /// # Example diff --git a/src/random_state.rs b/src/random_state.rs index 60ba51f..5f6a085 100644 --- a/src/random_state.rs +++ b/src/random_state.rs @@ -195,6 +195,16 @@ cfg_if::cfg_if! { /// [Hasher]: std::hash::Hasher /// [BuildHasher]: std::hash::BuildHasher /// [HashMap]: std::collections::HashMap +/// +/// There are multiple constructors each is documented in more detail below: +/// +/// | Constructor | Dynamically random? | Seed | +/// |---------------|---------------------|------| +/// |`new` | Each instance unique|_`RandomSource`_| +/// |`generate_with`| Each instance unique|`u64` x 4 + static counter| +/// |`with_seed` | Fixed per process |`u64` + static random number| +/// |`with_seeds` | Fixed |`u64` x 4| +/// #[derive(Clone)] pub struct RandomState { pub(crate) k0: u64, @@ -210,7 +220,11 @@ impl fmt::Debug for RandomState { } impl RandomState { - /// Use randomly generated keys + + /// Create a new `RandomState` `BuildHasher` using random keys. + /// + /// NOTE: This method is only available when a source of randomness is available. So + /// either the flag `runtime-rng` (on by default) or `compile-time-rng` must be enabled. #[inline] #[cfg(any(feature = "compile-time-rng", feature = "runtime-rng"))] pub fn new() -> RandomState { @@ -219,8 +233,15 @@ impl RandomState { Self::from_keys(&fixed[0], &fixed[1], src.gen_hasher_seed()) } - /// Allows for supplying seeds, but each time it is called the resulting state will be different. - /// This is done using a static counter, so it can safely be used with a fixed keys. 
+ /// Create a new `RandomState` `BuildHasher` based on the provided seeds, but in such a way + /// that each time it is called the resulting state will be different and of high quality. + /// This allows fixed constant or poor quality seeds to be provided without the problem of different + /// `BuildHasher`s being identical or weak. + /// + /// This is done via permuting the provided values with the value of a static counter and memory address. + /// (This makes this method somewhat more expensive than `with_seeds` below which does not do this). + /// + /// The provided values (k0-k3) do not need to be of high quality but they should not all be the same value. #[inline] pub fn generate_with(k0: u64, k1: u64, k2: u64, k3: u64) -> RandomState { let src = get_src(); @@ -253,7 +274,11 @@ impl RandomState { RandomState { k0, k1, k2, k3 } } - /// Allows for explicitly setting a seed to used. + /// Build a `RandomState` from a single key. The provided key does not need to be of high quality, + /// but all `RandomState`s created from the same key will produce identical hashers. + /// (In contrast to `generate_with` above) + /// + /// This allows for explicitly setting the seed to be used. /// /// Note: This method does not require the provided seed to be strong. #[inline] @@ -263,9 +288,13 @@ impl RandomState { } /// Allows for explicitly setting the seeds to used. + /// All `RandomState`s created with the same set of keys will produce identical hashers. + /// (In contrast to `generate_with` above) /// - /// Note: This method is robust against 0s being passed for one or more of the parameters - /// or the same value being passed for more than one parameter. + /// Note: If DOS resistance is desired one of these should be a decent quality random number. + /// If 4 high quality random numbers are not cheaply available this method is robust against 0s being passed for + /// one or more of the parameters or the same value being passed for more than one parameter. 
+ /// It is recommended to pass numbers in order from highest to lowest quality (if there is any difference). #[inline] pub const fn with_seeds(k0: u64, k1: u64, k2: u64, k3: u64) -> RandomState { RandomState { @@ -276,7 +305,36 @@ impl RandomState { } } - /// Calculates the hash of a single value. + /// Calculates the hash of a single value. This provides a more convenient (and faster) way to obtain a hash: + /// For example: + #[cfg_attr( + any(feature = "compile-time-rng", feature = "runtime-rng"), + doc = r##" # Examples +``` + use std::hash::BuildHasher; + use ahash::RandomState; + + let hash_builder = RandomState::new(); + let hash = hash_builder.hash_one("Some Data"); +``` + "## + )] + /// This is similar to: + #[cfg_attr( + any(feature = "compile-time-rng", feature = "runtime-rng"), + doc = r##" # Examples +``` + use std::hash::{BuildHasher, Hash, Hasher}; + use ahash::RandomState; + + let hash_builder = RandomState::new(); + let mut hasher = hash_builder.build_hasher(); + "Some Data".hash(&mut hasher); + let hash = hasher.finish(); +``` + "## + )] + /// (Note that these two ways to get a hash may not produce the same value for the same data) /// /// This is intended as a convenience for code which *consumes* hashes, such /// as the implementation of a hash table or in unit tests that check @@ -296,6 +354,12 @@ impl RandomState { } } +/// Creates an instance of RandomState using keys obtained from the random number generator. +/// Each instance created in this way will have a unique set of keys. (But the resulting instance +/// can be used to create many hashers each of which will have the same keys.) +/// +/// NOTE: This method is only available when a source of randomness is available. So +/// either the flag `runtime-rng` (on by default) or `compile-time-rng` must be enabled. 
#[cfg(any(feature = "compile-time-rng", feature = "runtime-rng"))] impl Default for RandomState { #[inline] @@ -343,7 +407,37 @@ impl BuildHasher for RandomState { AHasher::from_random_state(self) } - /// Calculates the hash of a single value. + + /// Calculates the hash of a single value. This provides a more convenient (and faster) way to obtain a hash: + /// For example: + #[cfg_attr( + any(feature = "compile-time-rng", feature = "runtime-rng"), + doc = r##" # Examples +``` + use std::hash::BuildHasher; + use ahash::RandomState; + + let hash_builder = RandomState::new(); + let hash = hash_builder.hash_one("Some Data"); +``` + "## + )] + /// This is similar to: + #[cfg_attr( + any(feature = "compile-time-rng", feature = "runtime-rng"), + doc = r##" # Examples +``` + use std::hash::{BuildHasher, Hash, Hasher}; + use ahash::RandomState; + + let hash_builder = RandomState::new(); + let mut hasher = hash_builder.build_hasher(); + "Some Data".hash(&mut hasher); + let hash = hasher.finish(); +``` + "## + )] + /// (Note that these two ways to get a hash may not produce the same value for the same data) /// /// This is intended as a convenience for code which *consumes* hashes, such /// as the implementation of a hash table or in unit tests that check From fce026d45888c1e05d738ec92ddb0a3f4326cbda Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Tue, 25 Oct 2022 00:00:03 -0700 Subject: [PATCH 2/3] Add into_keys and into_values Signed-off-by: Tom Kaitchuck --- src/hash_map.rs | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/hash_map.rs b/src/hash_map.rs index 693ae4b..e3bbf94 100644 --- a/src/hash_map.rs +++ b/src/hash_map.rs @@ -1,5 +1,6 @@ use std::borrow::Borrow; use std::collections::{hash_map, HashMap}; +use std::collections::hash_map::{IntoKeys, IntoValues}; use std::fmt::{self, Debug}; use std::hash::{BuildHasher, Hash}; use std::iter::FromIterator; @@ -181,6 +182,67 @@ where self.0.insert(k, v) } + /// 
Creates a consuming iterator visiting all the keys in arbitrary order. + /// The map cannot be used after calling this. + /// The iterator element type is `K`. + /// + /// # Examples + /// + /// ``` + /// use std::collections::HashMap; + /// + /// let map = HashMap::from([ + /// ("a", 1), + /// ("b", 2), + /// ("c", 3), + /// ]); + /// + /// let mut vec: Vec<&str> = map.into_keys().collect(); + /// // The `IntoKeys` iterator produces keys in arbitrary order, so the + /// // keys must be sorted to test them against a sorted array. + /// vec.sort_unstable(); + /// assert_eq!(vec, ["a", "b", "c"]); + /// ``` + /// + /// # Performance + /// + /// In the current implementation, iterating over keys takes O(capacity) time + /// instead of O(len) because it internally visits empty buckets too. + #[inline] + pub fn into_keys(self) -> IntoKeys<K, V> { + self.0.into_keys() + } + + /// Creates a consuming iterator visiting all the values in arbitrary order. + /// The map cannot be used after calling this. + /// The iterator element type is `V`. + /// + /// # Examples + /// + /// ``` + /// use std::collections::HashMap; + /// + /// let map = HashMap::from([ + /// ("a", 1), + /// ("b", 2), + /// ("c", 3), + /// ]); + /// + /// let mut vec: Vec<i32> = map.into_values().collect(); + /// // The `IntoValues` iterator produces values in arbitrary order, so + /// // the values must be sorted to test them against a sorted array. + /// vec.sort_unstable(); + /// assert_eq!(vec, [1, 2, 3]); + /// ``` + /// + /// # Performance + /// + /// In the current implementation, iterating over values takes O(capacity) time + /// instead of O(len) because it internally visits empty buckets too. + pub fn into_values(self) -> IntoValues<K, V> { + self.0.into_values() + } + /// Removes a key from the map, returning the value at the key if the key /// was previously in the map. 
/// From ab8ec5dc56c9d4111c73a080c2e71ee5236851bf Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Tue, 25 Oct 2022 00:35:13 -0700 Subject: [PATCH 3/3] Add inline Signed-off-by: Tom Kaitchuck --- src/hash_map.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hash_map.rs b/src/hash_map.rs index e3bbf94..dced504 100644 --- a/src/hash_map.rs +++ b/src/hash_map.rs @@ -239,6 +239,7 @@ where /// /// In the current implementation, iterating over values takes O(capacity) time /// instead of O(len) because it internally visits empty buckets too. + #[inline] pub fn into_values(self) -> IntoValues<K, V> { self.0.into_values() }