diff --git a/CHANGELOG.md b/CHANGELOG.md index 716f6639a20..856d3ac35e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ A [separate changelog is kept for rand_core](rand_core/CHANGELOG.md). You may also find the [Update Guide](UPDATING.md) useful. +## [0.6.0] - Unreleased + +### Sequences module +- Optimised and changed return type of the `sample_indices` function. (#479) + ## [0.5.4] - 2018-07-11 ### Platform support - Make `OsRng` work via WASM/stdweb for WebWorkers diff --git a/benches/seq.rs b/benches/seq.rs index 260e2334a41..f143131763b 100644 --- a/benches/seq.rs +++ b/benches/seq.rs @@ -1,4 +1,5 @@ #![feature(test)] +#![allow(non_snake_case)] extern crate test; extern crate rand; @@ -27,28 +28,31 @@ fn seq_slice_choose_1_of_1000(b: &mut Bencher) { }) } -#[bench] -fn seq_slice_choose_multiple_1_of_1000(b: &mut Bencher) { - let mut rng = SmallRng::from_rng(thread_rng()).unwrap(); - let x : &[usize] = &[1; 1000]; - b.iter(|| { - x.choose_multiple(&mut rng, 1).cloned().next() - }) -} - -#[bench] -fn seq_slice_choose_multiple_10_of_100(b: &mut Bencher) { - let mut rng = SmallRng::from_rng(thread_rng()).unwrap(); - let x : &[usize] = &[1; 100]; - let mut buf = [0; 10]; - b.iter(|| { - for (v, slot) in x.choose_multiple(&mut rng, buf.len()).zip(buf.iter_mut()) { - *slot = *v; +macro_rules! seq_slice_choose_multiple { + ($name:ident, $amount:expr, $length:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + let mut rng = SmallRng::from_rng(thread_rng()).unwrap(); + let x : &[i32] = &[$amount; $length]; + let mut result = [0i32; $amount]; + b.iter(|| { + // Collect full result to prevent unwanted shortcuts getting + // first element (in case sample_indices returns an iterator). 
+ for (slot, sample) in result.iter_mut().zip( + x.choose_multiple(&mut rng, $amount)) { + *slot = *sample; + } + result[$amount-1] + }) } - buf - }) + } } +seq_slice_choose_multiple!(seq_slice_choose_multiple_1_of_1000, 1, 1000); +seq_slice_choose_multiple!(seq_slice_choose_multiple_950_of_1000, 950, 1000); +seq_slice_choose_multiple!(seq_slice_choose_multiple_10_of_100, 10, 100); +seq_slice_choose_multiple!(seq_slice_choose_multiple_90_of_100, 90, 100); + #[bench] fn seq_iter_choose_from_100(b: &mut Bencher) { let mut rng = SmallRng::from_rng(thread_rng()).unwrap(); @@ -78,17 +82,22 @@ fn seq_iter_choose_multiple_fill_10_of_100(b: &mut Bencher) { } macro_rules! sample_indices { - ($name:ident, $amount:expr, $length:expr) => { + ($name:ident, $fn:ident, $amount:expr, $length:expr) => { #[bench] fn $name(b: &mut Bencher) { let mut rng = SmallRng::from_rng(thread_rng()).unwrap(); b.iter(|| { - sample_indices(&mut rng, $length, $amount) + index::$fn(&mut rng, $length, $amount) }) } } } -sample_indices!(seq_sample_indices_10_of_1k, 10, 1000); -sample_indices!(seq_sample_indices_50_of_1k, 50, 1000); -sample_indices!(seq_sample_indices_100_of_1k, 100, 1000); +sample_indices!(misc_sample_indices_1_of_1k, sample, 1, 1000); +sample_indices!(misc_sample_indices_10_of_1k, sample, 10, 1000); +sample_indices!(misc_sample_indices_100_of_1k, sample, 100, 1000); +sample_indices!(misc_sample_indices_100_of_1M, sample, 100, 1000_000); +sample_indices!(misc_sample_indices_100_of_1G, sample, 100, 1000_000_000); +sample_indices!(misc_sample_indices_200_of_1G, sample, 200, 1000_000_000); +sample_indices!(misc_sample_indices_400_of_1G, sample, 400, 1000_000_000); +sample_indices!(misc_sample_indices_600_of_1G, sample, 600, 1000_000_000); diff --git a/src/lib.rs b/src/lib.rs index f07a68c495a..197fc2546fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -234,7 +234,7 @@ #![cfg_attr(feature = "wasm-bindgen", feature(wasm_import_module))] #[cfg(feature = "std")] extern crate core; 
-#[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc;
+#[cfg(all(feature = "alloc", not(feature="std")))] #[macro_use] extern crate alloc;
 #[cfg(feature="simd_support")] extern crate packed_simd;
diff --git a/src/seq/index.rs b/src/seq/index.rs
new file mode 100644
index 00000000000..805b7f3c2f3
--- /dev/null
+++ b/src/seq/index.rs
@@ -0,0 +1,398 @@
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// https://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 or the MIT license
+// , at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Index sampling
+
+#[cfg(feature="alloc")] use core::slice;
+
+#[cfg(feature="std")] use std::vec;
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::{self, Vec};
+// BTreeSet is not as fast as HashSet in tests, but better than nothing.
+#[cfg(feature="std")] use std::collections::{HashSet};
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
+
+#[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
+use Rng;
+
+/// A vector of indices.
+///
+/// Multiple internal representations are possible.
+#[derive(Clone, Debug)]
+pub enum IndexVec {
+    #[doc(hidden)] U32(Vec<u32>),
+    #[doc(hidden)] USize(Vec<usize>),
+}
+
+impl IndexVec {
+    /// Returns the number of indices
+    pub fn len(&self) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v.len(),
+            &IndexVec::USize(ref v) => v.len(),
+        }
+    }
+
+    /// Return the value at the given `index`.
+    ///
+    /// (Note: we cannot implement `std::ops::Index` because of lifetime
+    /// restrictions.)
+    pub fn index(&self, index: usize) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v[index] as usize,
+            &IndexVec::USize(ref v) => v[index],
+        }
+    }
+
+    /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
+    pub fn into_vec(self) -> Vec<usize> {
+        match self {
+            IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+            IndexVec::USize(v) => v,
+        }
+    }
+
+    /// Iterate over the indices as a sequence of `usize` values
+    pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
+        match self {
+            &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
+            &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
+        }
+    }
+
+    /// Convert into an iterator over the indices as a sequence of `usize` values
+    pub fn into_iter(self) -> IndexVecIntoIter {
+        match self {
+            IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
+            IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
+        }
+    }
+}
+
+impl PartialEq for IndexVec {
+    fn eq(&self, other: &IndexVec) -> bool {
+        use self::IndexVec::*;
+        match (self, other) {
+            (&U32(ref v1), &U32(ref v2)) => v1 == v2,
+            (&USize(ref v1), &USize(ref v2)) => v1 == v2,
+            (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
+            (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
+        }
+    }
+}
+
+impl From<Vec<u32>> for IndexVec {
+    fn from(v: Vec<u32>) -> Self {
+        IndexVec::U32(v)
+    }
+}
+
+impl From<Vec<usize>> for IndexVec {
+    fn from(v: Vec<usize>) -> Self {
+        IndexVec::USize(v)
+    }
+}
+
+/// Return type of `IndexVec::iter`.
+#[derive(Debug)]
+pub enum IndexVecIter<'a> {
+    #[doc(hidden)] U32(slice::Iter<'a, u32>),
+    #[doc(hidden)] USize(slice::Iter<'a, usize>),
+}
+
+impl<'a> Iterator for IndexVecIter<'a> {
+    type Item = usize;
+    fn next(&mut self) -> Option<usize> {
+        use self::IndexVecIter::*;
+        match self {
+            &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
+            &mut USize(ref mut iter) => iter.next().cloned(),
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            &IndexVecIter::U32(ref v) => v.size_hint(),
+            &IndexVecIter::USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
+
+/// Return type of `IndexVec::into_iter`.
+#[derive(Clone, Debug)]
+pub enum IndexVecIntoIter {
+    #[doc(hidden)] U32(vec::IntoIter<u32>),
+    #[doc(hidden)] USize(vec::IntoIter<usize>),
+}
+
+impl Iterator for IndexVecIntoIter {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &mut U32(ref mut v) => v.next().map(|i| i as usize),
+            &mut USize(ref mut v) => v.next(),
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &U32(ref v) => v.size_hint(),
+            &USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl ExactSizeIterator for IndexVecIntoIter {}
+
+
+/// Randomly sample exactly `amount` distinct indices from `0..length`, and
+/// return them in random order (fully shuffled).
+///
+/// This method is used internally by the slice sampling methods, but it can
+/// sometimes be useful to have the indices themselves so this is provided as
+/// an alternative.
+///
+/// The implementation used is not specified; we automatically select the
+/// fastest available algorithm for the `length` and `amount` parameters
+/// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
+/// complexity is `O(amount)`, except that when `amount` is small, performance
+/// is closer to `O(amount^2)`, and when `length` is close to `amount` then
+/// `O(length)`.
+///
+/// Note that performance is significantly better over `u32` indices than over
+/// `u64` indices. Because of this we hide the underlying type behind an
+/// abstraction, `IndexVec`.
+///
+/// If an allocation-free `no_std` function is required, it is suggested
+/// to adapt the internal `sample_floyd` implementation.
+///
+/// Panics if `amount > length`.
+pub fn sample<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    if amount > length {
+        panic!("`amount` of samples must be less than or equal to `length`");
+    }
+    if length > (::core::u32::MAX as usize) {
+        // We never want to use inplace here, but could use floyd's alg
+        // Lazy version: always use the rejection alg (`usize` indices).
+        return sample_rejection(rng, length, amount);
+    }
+    let amount = amount as u32;
+    let length = length as u32;
+
+    // Choice of algorithm here depends on both length and amount. See:
+    // https://github.com/rust-lang-nursery/rand/pull/479
+    // We do some calculations with f32. Accuracy is not very important.
+
+    if amount < 163 {
+        const C: [[f32; 2]; 2] = [[1.6, 8.0/45.0], [10.0, 70.0/9.0]];
+        let j = if length < 500_000 { 0 } else { 1 };
+        let amount_fp = amount as f32;
+        let m4 = C[0][j] * amount_fp;
+        // Short-cut: when amount < 12, floyd's is always faster
+        if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
+            sample_inplace(rng, length, amount)
+        } else {
+            sample_floyd(rng, length, amount)
+        }
+    } else {
+        const C: [f32; 2] = [270.0, 330.0/9.0];
+        let j = if length < 500_000 { 0 } else { 1 };
+        if (length as f32) < C[j] * (amount as f32) {
+            sample_inplace(rng, length, amount)
+        } else {
+            // note: could have a specific u32 impl, but I'm lazy and
+            // generics don't have usable conversions
+            sample_rejection(rng, length as usize, amount as usize)
+        }
+    }
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
+/// combination algorithm.
+///
+/// The output values are fully shuffled. (Overhead is under 50%.)
+///
+/// This implementation uses `O(amount)` memory and `O(amount^2)` time.
+fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    // Local linear search; same result as `slice.iter().position(|x| *x == elt)`.
+    fn find_pos<T: Copy + PartialEq<T>>(slice: &[T], elt: T) -> Option<usize> {
+        for i in 0..slice.len() {
+            if slice[i] == elt {
+                return Some(i);
+            }
+        }
+        None
+    }
+
+    // For small amount we use Floyd's fully-shuffled variant. For larger
+    // amounts this is slow due to Vec::insert performance, so we shuffle
+    // afterwards. Benchmarks show little overhead from extra logic.
+    let floyd_shuffle = amount < 50;
+
+    debug_assert!(amount <= length);
+    let mut indices = Vec::with_capacity(amount as usize);
+    for j in length - amount .. length {
+        let t = rng.gen_range(0, j + 1);
+        if floyd_shuffle {
+            if let Some(pos) = find_pos(&indices, t) {
+                indices.insert(pos, j);
+                continue;
+            }
+        } else {
+            if indices.contains(&t) {
+                indices.push(j);
+                continue;
+            }
+        }
+        indices.push(t);
+    }
+    if !floyd_shuffle {
+        // Reimplement SliceRandom::shuffle with smaller indices
+        for i in (1..amount).rev() {
+            // invariant: elements with index > i have been locked in place.
+            indices.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+        }
+    }
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using an inplace
+/// partial Fisher-Yates method.
+/// The resulting indices are distinct and in random order.
+///
+/// This allocates the entire `length` of indices and randomizes only the first `amount`.
+/// It then truncates to `amount` and returns.
+///
+/// This method is not appropriate for large `length` and potentially uses a lot
+/// of memory; because of this we only implement for `u32` index (which improves
+/// performance in all cases).
+///
+/// Set-up is `O(length)` time and memory and shuffling is `O(amount)` time.
+fn sample_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount <= length);
+    let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
+    indices.extend(0..length);
+    for i in 0..amount {
+        let j: u32 = rng.gen_range(i, length);
+        indices.swap(i as usize, j as usize);
+    }
+    indices.truncate(amount as usize);
+    debug_assert_eq!(indices.len(), amount as usize);
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using rejection
+/// sampling.
+///
+/// Since `amount <<< length` there is a low chance of a random sample in
+/// `0..length` being a duplicate. We test for duplicates and resample where
+/// necessary. The algorithm is `O(amount)` time and memory.
+fn sample_rejection<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount < length);
+    #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
+    #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
+    let distr = Uniform::new(0, length);
+    let mut indices = Vec::with_capacity(amount);
+    for _ in 0..amount {
+        let mut pos = distr.sample(rng);
+        while !cache.insert(pos) {
+            pos = distr.sample(rng);
+        }
+        indices.push(pos);
+    }
+
+    debug_assert_eq!(indices.len(), amount);
+    IndexVec::from(indices)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use {Rng, SeedableRng};
+    use prng::XorShiftRng;
+
+    #[test]
+    fn test_sample_boundaries() {
+        let mut r = ::test::rng(404);
+
+        assert_eq!(sample_inplace(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 1).into_vec(), vec![0]);
+
+        assert_eq!(sample_rejection(&mut r, 1, 0).len(), 0);
+
+        assert_eq!(sample_floyd(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 1).into_vec(), vec![0]);
+
+        // These algorithms should be fast with big numbers. Test average.
+        let sum: usize = sample_rejection(&mut r, 1 << 25, 10)
+            .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+
+        let sum: usize = sample_floyd(&mut r, 1 << 25, 10)
+            .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+    }
+
+    #[test]
+    fn test_sample_alg() {
+        let xor_rng = XorShiftRng::from_seed;
+
+        let mut r = ::test::rng(403);
+        let mut seed = [0u8; 16];
+
+        // We can't test which algorithm is used directly, but Floyd's alg
+        // should produce different results from the others. (Also, `inplace`
+        // and `rejection` currently use different sizes thus produce different results.)
+
+        // A small length and relatively large amount should use inplace
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (100, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_inplace(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+
+        // Test Floyd's alg does produce different results
+        let v3 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1 != v3);
+
+        // A large length and small amount should use Floyd
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+
+        // A large length and larger amount should use rejection
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 600);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_rejection(&mut xor_rng(seed), length, amount);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+    }
+}
diff --git a/src/seq.rs b/src/seq/mod.rs
similarity index 79%
rename from src/seq.rs
rename to src/seq/mod.rs
index e030712b3d1..4e06bac2863 100644
--- a/src/seq.rs
+++ b/src/seq/mod.rs
@@ -12,18 +12,15 @@
 //!
 //! 
TODO: module doc + +#[cfg(feature="alloc")] pub mod index; + #[cfg(feature="alloc")] use core::ops::Index; -#[cfg(feature="std")] use std::vec; -#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec; #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::Vec; -// BTreeMap is not as fast in tests, but better than nothing. -#[cfg(feature="std")] use std::collections::HashMap; -#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeMap; -#[cfg(feature = "alloc")] use distributions::WeightedError; - -use super::Rng; +use Rng; +#[cfg(feature="alloc")] use distributions::WeightedError; #[cfg(feature="alloc")] use distributions::uniform::{SampleUniform, SampleBorrow}; /// Extension trait on slices, providing random mutation and sampling methods. @@ -61,18 +58,12 @@ pub trait SliceRandom { where R: Rng + ?Sized; /// Produces an iterator that chooses `amount` elements from the slice at - /// random without repeating any. - /// - /// In case this API is not sufficiently flexible, use `sample_indices` then - /// apply the indices to the slice. + /// random without repeating any, and returns them in random order. /// - /// Although the elements are selected randomly, the order of returned - /// elements is neither stable nor fully random. If random ordering is - /// desired, either use `partial_shuffle` or use this method and shuffle - /// the result. If stable order is desired, use `sample_indices`, sort the - /// result, then apply to the slice. + /// In case this API is not sufficiently flexible, use `index::sample` then + /// apply the indices to the slice. /// - /// Complexity is expected to be the same as `sample_indices`. + /// Complexity is expected to be the same as `index::sample`. 
/// /// # Example /// ``` @@ -317,14 +308,15 @@ impl SliceRandom for [T] { } #[cfg(feature = "alloc")] - fn choose_multiple(&self, rng: &mut R, amount: usize) -> SliceChooseIter + fn choose_multiple(&self, rng: &mut R, amount: usize) + -> SliceChooseIter where R: Rng + ?Sized { let amount = ::core::cmp::min(amount, self.len()); SliceChooseIter { slice: self, _phantom: Default::default(), - indices: sample_indices(rng, self.len(), amount).into_iter(), + indices: index::sample(rng, self.len(), amount).into_iter(), } } @@ -396,7 +388,7 @@ impl IteratorRandom for I where I: Iterator + Sized {} pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> { slice: &'a S, _phantom: ::core::marker::PhantomData, - indices: vec::IntoIter, + indices: index::IndexVecIntoIter, } #[cfg(feature = "alloc")] @@ -405,7 +397,7 @@ impl<'a, S: Index + ?Sized + 'a, T: 'a> Iterator for SliceCho fn next(&mut self) -> Option { // TODO: investigate using SliceIndex::get_unchecked when stable - self.indices.next().map(|i| &(*self.slice)[i]) + self.indices.next().map(|i| &self.slice[i as usize]) } fn size_hint(&self) -> (usize, Option) { @@ -461,10 +453,10 @@ pub fn sample_slice(rng: &mut R, slice: &[T], amount: usize) -> Vec where R: Rng + ?Sized, T: Clone { - let indices = sample_indices(rng, slice.len(), amount); + let indices = index::sample(rng, slice.len(), amount).into_iter(); let mut out = Vec::with_capacity(amount); - out.extend(indices.iter().map(|i| slice[*i].clone())); + out.extend(indices.map(|i| slice[i].clone())); out } @@ -484,113 +476,10 @@ pub fn sample_slice(rng: &mut R, slice: &[T], amount: usize) -> Vec pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T> where R: Rng + ?Sized { - let indices = sample_indices(rng, slice.len(), amount); - - let mut out = Vec::with_capacity(amount); - out.extend(indices.iter().map(|i| &slice[*i])); - out -} - -/// Randomly sample exactly `amount` indices from `0..length`. 
-/// -/// The values are non-repeating and in random order. -/// -/// This implementation uses `O(amount)` time and memory. -/// -/// This method is used internally by the slice sampling methods, but it can sometimes be useful to -/// have the indices themselves so this is provided as an alternative. -/// -/// Panics if `amount > length` -#[cfg(feature = "alloc")] -pub fn sample_indices(rng: &mut R, length: usize, amount: usize) -> Vec - where R: Rng + ?Sized, -{ - if amount > length { - panic!("`amount` must be less than or equal to `slice.len()`"); - } - - // We are going to have to allocate at least `amount` for the output no matter what. However, - // if we use the `cached` version we will have to allocate `amount` as a HashMap as well since - // it inserts an element for every loop. - // - // Therefore, if `amount >= length / 2` then inplace will be both faster and use less memory. - // In fact, benchmarks show the inplace version is faster for length up to about 20 times - // faster than amount. - // - // TODO: there is probably even more fine-tuning that can be done here since - // `HashMap::with_capacity(amount)` probably allocates more than `amount` in practice, - // and a trade off could probably be made between memory/cpu, since hashmap operations - // are slower than array index swapping. - if amount >= length / 20 { - sample_indices_inplace(rng, length, amount) - } else { - sample_indices_cache(rng, length, amount) - } -} - -/// Sample an amount of indices using an inplace partial fisher yates method. -/// -/// This allocates the entire `length` of indices and randomizes only the first `amount`. -/// It then truncates to `amount` and returns. -/// -/// This is better than using a `HashMap` "cache" when `amount >= length / 2` -/// since it does not require allocating an extra cache and is much faster. 
-#[cfg(feature = "alloc")] -fn sample_indices_inplace(rng: &mut R, length: usize, amount: usize) -> Vec - where R: Rng + ?Sized, -{ - debug_assert!(amount <= length); - let mut indices: Vec = Vec::with_capacity(length); - indices.extend(0..length); - for i in 0..amount { - let j: usize = rng.gen_range(i, length); - indices.swap(i, j); - } - indices.truncate(amount); - debug_assert_eq!(indices.len(), amount); - indices -} + let indices = index::sample(rng, slice.len(), amount).into_iter(); - -/// This method performs a partial fisher-yates on a range of indices using a -/// `HashMap` as a cache to record potential collisions. -/// -/// The cache avoids allocating the entire `length` of values. This is especially useful when -/// `amount <<< length`, i.e. select 3 non-repeating from `1_000_000` -#[cfg(feature = "alloc")] -fn sample_indices_cache( - rng: &mut R, - length: usize, - amount: usize, -) -> Vec - where R: Rng + ?Sized, -{ - debug_assert!(amount <= length); - #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount); - #[cfg(not(feature="std"))] let mut cache = BTreeMap::new(); let mut out = Vec::with_capacity(amount); - for i in 0..amount { - let j: usize = rng.gen_range(i, length); - - // equiv: let tmp = slice[i]; - let tmp = match cache.get(&i) { - Some(e) => *e, - None => i, - }; - - // equiv: slice[i] = slice[j]; - let x = match cache.get(&j) { - Some(x) => *x, - None => j, - }; - - // equiv: slice[j] = tmp; - cache.insert(j, tmp); - - // note that in the inplace version, slice[i] is automatically "returned" value - out.push(x); - } - debug_assert_eq!(out.len(), amount); + out.extend(indices.map(|i| &slice[i])); out } @@ -648,7 +537,6 @@ mod test { #[test] fn test_shuffle() { - let mut r = ::test::rng(108); let empty: &mut [isize] = &mut []; empty.shuffle(&mut r); @@ -752,14 +640,6 @@ mod test { let v = sample_slice(&mut r, &[42, 133], 2); assert!(&v[..] == [42, 133] || v[..] 
== [133, 42]); - assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0usize; 0]); - assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0usize; 0]); - assert_eq!(&sample_indices_inplace(&mut r, 1, 1)[..], [0]); - - assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0usize; 0]); - assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0usize; 0]); - assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]); - // Make sure lucky 777's aren't lucky let slice = &[42, 777]; let mut num_42 = 0; @@ -783,43 +663,29 @@ mod test { fn test_sample_slice() { let xor_rng = XorShiftRng::from_seed; - let max_range = 100; let mut r = ::test::rng(403); - for length in 1usize..max_range { + for n in 1..20 { + let length = 5*n - 4; // 1, 6, ... let amount = r.gen_range(0, length); let mut seed = [0u8; 16]; r.fill(&mut seed); - // assert that the two index methods give exactly the same result - let inplace = sample_indices_inplace( - &mut xor_rng(seed), length, amount); - let cache = sample_indices_cache( - &mut xor_rng(seed), length, amount); - assert_eq!(inplace, cache); - // assert the basics work - let regular = sample_indices( - &mut xor_rng(seed), length, amount); + let regular = index::sample(&mut xor_rng(seed), length, amount); assert_eq!(regular.len(), amount); - assert!(regular.iter().all(|e| *e < length)); - assert_eq!(regular, inplace); + assert!(regular.iter().all(|e| e < length)); // also test that sampling the slice works - let vec: Vec = (0..length).collect(); - { - let result = sample_slice(&mut xor_rng(seed), &vec, amount); - assert_eq!(result, regular); - } + let vec: Vec = (0..(length as u32)).collect(); + let result = sample_slice(&mut xor_rng(seed), &vec, amount); + assert_eq!(result, regular.iter().map(|i| i as u32).collect::>()); - { - let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount); - let expected = regular.iter().map(|v| v).collect::>(); - assert_eq!(result, expected); - } + let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount); 
+ assert!(result.iter().zip(regular.iter()).all(|(i,j)| **i == j as u32)); } } - + #[test] #[cfg(feature = "alloc")] fn test_weighted() {