Add SliceRandom::choose_multiple_weighted, implementing weighted sampling without replacement #976

Closed
wants to merge 6 commits
5 changes: 4 additions & 1 deletion Cargo.toml
@@ -23,7 +23,7 @@ appveyor = { repository = "rust-random/rand" }
[features]
# Meta-features:
default = ["std", "std_rng"]
nightly = ["simd_support"] # enables all features requiring nightly rust
nightly = ["simd_support", "partition_at_index"] # enables all features requiring nightly rust
Collaborator

A simpler approach might be to just use the nightly feature instead of introducing a new one. This would have the advantage that we don't have to worry about dropping the feature in the future without breaking code.

Contributor Author

My reasoning here is that this allows leaving the feature flag in place even once "slice_partition_at_index" stabilizes, which would allow supporting older rustc versions. I also see your point about dropping the feature being a breaking change, though...

Collaborator

Fair enough. I think we will likely just raise the minimum Rust version. @dhardy What do you think?

Member
@dhardy dhardy May 29, 2020

In general, it is not possible to reliably support old nightlies, and not very useful either. So removing a feature flag only usable on nightly compilers once that feature has stabilised is not an issue.

serde1 = [] # does nothing, deprecated

# Option (enabled by default): without "std" rand uses libcore; this option
@@ -45,6 +45,9 @@ std_rng = ["rand_chacha", "rand_hc"]
# Option: enable SmallRng
small_rng = ["rand_pcg"]

# Option (requires nightly): better performance of choose_multiple_weighted
partition_at_index = []

[workspace]
members = [
"rand_core",
21 changes: 21 additions & 0 deletions benches/seq.rs
@@ -177,3 +177,24 @@ sample_indices!(misc_sample_indices_100_of_1G, sample, 100, 1000_000_000);
sample_indices!(misc_sample_indices_200_of_1G, sample, 200, 1000_000_000);
sample_indices!(misc_sample_indices_400_of_1G, sample, 400, 1000_000_000);
sample_indices!(misc_sample_indices_600_of_1G, sample, 600, 1000_000_000);

macro_rules! sample_indices_rand_weights {
($name:ident, $amount:expr, $length:expr) => {
#[bench]
fn $name(b: &mut Bencher) {
let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
b.iter(|| {
index::sample_weighted(&mut rng, $length, |idx| (1 + (idx % 100)) as u32, $amount)
})
}
};
}

sample_indices_rand_weights!(misc_sample_weighted_indices_1_of_1k, 1, 1000);
sample_indices_rand_weights!(misc_sample_weighted_indices_10_of_1k, 10, 1000);
sample_indices_rand_weights!(misc_sample_weighted_indices_100_of_1k, 100, 1000);
sample_indices_rand_weights!(misc_sample_weighted_indices_100_of_1M, 100, 1000_000);
sample_indices_rand_weights!(misc_sample_weighted_indices_200_of_1M, 200, 1000_000);
sample_indices_rand_weights!(misc_sample_weighted_indices_400_of_1M, 400, 1000_000);
sample_indices_rand_weights!(misc_sample_weighted_indices_600_of_1M, 600, 1000_000);
sample_indices_rand_weights!(misc_sample_weighted_indices_1k_of_1M, 1000, 1000_000);
1 change: 1 addition & 0 deletions src/lib.rs
@@ -50,6 +50,7 @@
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(all(feature = "simd_support", feature = "nightly"), feature(stdsimd))]
#![cfg_attr(all(feature = "partition_at_index", feature = "nightly"), feature(slice_partition_at_index))]
Member

This however makes separate use of partition_at_index pointless — I think unlike simd_support we should not depend on nightly here. As a bonus, this makes it impossible to use partition_at_index on stable compilers.

Collaborator

I still don't understand what the advantage of a separate partition_at_index is.

#![allow(
clippy::excessive_precision,
clippy::unreadable_literal,
122 changes: 118 additions & 4 deletions src/seq/index.rs
@@ -8,18 +8,21 @@

//! Low-level API for sampling indices

#[cfg(feature = "alloc")] use core::slice;
#[cfg(feature = "alloc")]
use core::slice;

#[cfg(all(feature = "alloc", not(feature = "std")))]
use crate::alloc::vec::{self, Vec};
#[cfg(feature = "std")] use std::vec;
#[cfg(feature = "std")]
use std::vec;
// BTreeMap is not as fast in tests, but better than nothing.
#[cfg(all(feature = "alloc", not(feature = "std")))]
use crate::alloc::collections::BTreeSet;
#[cfg(feature = "std")] use std::collections::HashSet;
#[cfg(feature = "std")]
use std::collections::HashSet;
Comment on lines -11 to +22
Member
@dhardy dhardy May 29, 2020

Leave the reformatting out please.


#[cfg(feature = "alloc")]
use crate::distributions::{uniform::SampleUniform, Distribution, Uniform};
use crate::distributions::{uniform::SampleUniform, Distribution, Uniform, WeightedError};
use crate::Rng;

/// A vector of indices.
@@ -249,6 +252,117 @@ where R: Rng + ?Sized {
}
}

/// Randomly sample exactly `amount` distinct indices from `0..length`, and
/// return them in an arbitrary order (there is no guarantee of shuffling or
/// ordering). The weights are to be provided by the input function `weights`,
/// which will be called once for each index.
///
/// This method is used internally by the slice sampling methods, but it can
sometimes be useful to have the indices themselves, so this is provided as
an alternative.
///
/// This implementation uses `O(length + amount)` space and `O(length)` time
/// if the "partition_at_index" feature is enabled, or `O(length)` space and
/// `O(length + amount * log length)` time otherwise.
///
/// Panics if `amount > length`.
pub fn sample_weighted<R, F, X>(
rng: &mut R, length: usize, weight: F, amount: usize,
) -> Result<IndexVec, WeightedError>
where
R: Rng + ?Sized,
F: Fn(usize) -> X,
X: Into<f64>,
{
Comment on lines +269 to +276
Member

This implementation looks correct, but it may use a lot of memory without good cause. Suggestions:

If we know length <= u32::MAX, then use u32 instead of usize. I guess in this case we can't since a million entries can be evaluated in under a tenth of a second. It may or may not be worth testing and implementing multiple paths. (IndexVec is like it is for performance reasons.)

An implementation of the A-Res variant should compare well on performance I think (though still O(n)). An implementation of A-ExpJ variant should be the best option for length >> amount. It may be nice having both and selecting which to use with a heuristic, or it may be that A-ExpJ is fast enough to always be a good choice.
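The A-Res variant the reviewer suggests keeps only the `amount` best keys in a bounded min-heap, using O(amount) extra space instead of materializing all candidates. A rough sketch is below; this is not code from the PR. The `Xorshift` PRNG, the `Key` wrapper, and the function names are stand-ins for illustration (the real method would take an `R: Rng` from the crate), and `f64::total_cmp` stabilized only later, in Rust 1.62.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Stand-in PRNG (xorshift64) so the sketch has no external dependencies;
// not suitable for real use.
struct Xorshift(u64);
impl Xorshift {
    fn next_f64(&mut self) -> f64 {
        self.0 ^= self.0 << 13;
        self.0 ^= self.0 >> 7;
        self.0 ^= self.0 << 17;
        // Map the top 53 bits into [0, 1).
        (self.0 >> 11) as f64 / (1u64 << 53) as f64
    }
}

// f64 wrapper with a total order; NaN weights are rejected before any key
// is built, so total_cmp never has to break a tie involving NaN here.
#[derive(PartialEq)]
struct Key(f64);
impl Eq for Key {}
impl PartialOrd for Key {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Key {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.0.total_cmp(&other.0)
    }
}

/// A-Res: keep the `amount` largest keys in a size-bounded min-heap.
fn sample_weighted_a_res<F: Fn(usize) -> f64>(
    rng: &mut Xorshift, length: usize, weight: F, amount: usize,
) -> Vec<usize> {
    let mut heap: BinaryHeap<Reverse<(Key, usize)>> = BinaryHeap::with_capacity(amount);
    for index in 0..length {
        let w = weight(index);
        assert!(w > 0.0, "weights must be positive");
        let key = rng.next_f64().powf(1.0 / w);
        if heap.len() < amount {
            heap.push(Reverse((Key(key), index)));
        } else if heap.peek().map_or(false, |Reverse((min, _))| key > min.0) {
            // Evict the current smallest key in O(log amount).
            heap.pop();
            heap.push(Reverse((Key(key), index)));
        }
    }
    heap.into_iter().map(|Reverse((_, index))| index).collect()
}
```

Each index enters the heap at most once, so the result is always `amount` distinct indices; the trade-off versus the heap approach in this PR is O(length log amount) time against O(amount) rather than O(length) space.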

if amount > length {
panic!("`amount` of samples must be less than or equal to `length`");
}

// This implementation uses the algorithm described by Efraimidis and Spirakis
// in this paper: https://doi.org/10.1016/j.ipl.2005.11.003

struct Element {
index: usize,
key: f64,
}
impl PartialOrd for Element {
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
self.key
.partial_cmp(&other.key)
.or(Some(core::cmp::Ordering::Less))
Member

Why include this line? To avoid panic in Ord::cmp? Does it have some benefit? Because it's technically wrong, and means NaN values could have arbitrary sort position (some sorting algs might not even terminate). Better just to allow a panic IMO.
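The objection can be made concrete. In the sketch below, `fallback_cmp` (a hypothetical name) reproduces the PR's fallback of treating an incomparable pair as Less; with a NaN key, both orders of the same pair then compare as Less, which violates antisymmetry and is why sort behavior becomes arbitrary.

```rust
use std::cmp::Ordering;

// Mirrors the PR's `partial_cmp(...).or(Some(Ordering::Less))` fallback.
// For a NaN operand, partial_cmp returns None, so the result is always Less,
// regardless of argument order. That is not a valid ordering.
fn fallback_cmp(a: f64, b: f64) -> Ordering {
    a.partial_cmp(&b).or(Some(Ordering::Less)).unwrap()
}
```

For comparison, `f64::total_cmp` (stable since Rust 1.62, after this PR) gives a consistent total order in which positive NaN sorts above every other value, so sorting always terminates with NaN keys in a fixed position.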

}
}
impl Ord for Element {
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
self.partial_cmp(other).unwrap() // partial_cmp will always produce a value
}
}
impl PartialEq for Element {
fn eq(&self, other: &Self) -> bool {
self.key == other.key
}
}
impl Eq for Element {}

#[cfg(feature = "partition_at_index")]
{
if length == 0 {
return Ok(IndexVec::USize(Vec::new()));
}
Comment on lines +309 to +311
Member

Better: if amount == 0. And this check should be pushed up to the top of the method.


let mut candidates = Vec::with_capacity(length);
for index in 0..length {
let weight = weight(index).into();
if weight < 0.0 || weight.is_nan() {
Member

I prefer !(weight >= 0.0), though it is equivalent.

return Err(WeightedError::InvalidWeight);
}

let key = rng.gen::<f64>().powf(1.0 / weight);
candidates.push(Element { index, key })
}

// Partially sort the array to find the `amount` elements with the greatest
// keys. Do this by using `partition_at_index` to put the elements with
// the *smallest* keys at the beginning of the list in `O(n)` time, which
// provides equivalent information about the elements with the *greatest* keys.
let (_, mid, greater) = candidates.partition_at_index(length - amount);

let mut result = Vec::with_capacity(amount);
result.push(mid.index);
for element in greater {
result.push(element.index);
}
Ok(IndexVec::USize(result))
}

#[cfg(not(feature = "partition_at_index"))]
{
#[cfg(all(feature = "alloc", not(feature = "std")))]
use crate::alloc::collections::BinaryHeap;
#[cfg(feature = "std")]
use std::collections::BinaryHeap;

// Partially sort the array such that the `amount` elements with the largest
// keys are first using a binary max heap.
let mut candidates = BinaryHeap::with_capacity(length);
for index in 0..length {
let weight = weight(index).into();
if weight < 0.0 || weight.is_nan() {
return Err(WeightedError::InvalidWeight);
}

let key = rng.gen::<f64>().powf(1.0 / weight);
candidates.push(Element { index, key });
}

let mut result = Vec::with_capacity(amount);
while result.len() < amount {
result.push(candidates.pop().unwrap().index);
}
Ok(IndexVec::USize(result))
}
}
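The nightly `partition_at_index` used above was later stabilized as `select_nth_unstable` (Rust 1.49), so the O(length) selection step no longer needs a feature gate. A minimal stable-Rust sketch of just that step, with a hypothetical `(index, key)` tuple standing in for the PR's `Element` type (`f64::total_cmp`, used as the comparator, stabilized in Rust 1.62):

```rust
/// Select the `amount` entries with the largest keys from `candidates`
/// in O(candidates.len()) expected time, returning their indices in
/// arbitrary order. `select_nth_unstable_by` is the stabilized form of
/// the nightly `partition_at_index` used in this PR.
fn top_k_indices(mut candidates: Vec<(usize, f64)>, amount: usize) -> Vec<usize> {
    let length = candidates.len();
    assert!(amount <= length);
    if amount == 0 {
        return Vec::new();
    }
    // Put the (length - amount) smallest keys before the pivot; everything
    // from the pivot onward then holds the `amount` largest keys.
    let pivot = length - amount;
    candidates.select_nth_unstable_by(pivot, |a, b| a.1.total_cmp(&b.1));
    candidates[pivot..].iter().map(|&(index, _)| index).collect()
}
```

As in the PR, selecting the smallest `length - amount` keys is equivalent to isolating the largest `amount`, and avoids a full O(length log length) sort.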

/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
/// combination algorithm.
///