Skip to content

Commit

Permalink
seq: use Floyd's combination algorithm to sample indices
Browse files Browse the repository at this point in the history
  • Loading branch information
dhardy committed May 25, 2018
1 parent 112cae3 commit 87d5cd4
Showing 1 changed file with 9 additions and 100 deletions.
109 changes: 9 additions & 100 deletions src/seq.rs
Expand Up @@ -12,11 +12,6 @@

use super::Rng;

// This crate is only enabled when either std or alloc is available.
// BTreeMap is not as fast in tests, but better than nothing.
#[cfg(feature="std")] use std::collections::HashMap;
#[cfg(not(feature="std"))] use alloc::btree_map::BTreeMap;

#[cfg(not(feature="std"))] use alloc::Vec;

/// Randomly sample `amount` elements from a finite iterator.
Expand Down Expand Up @@ -139,87 +134,13 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize
panic!("`amount` must be less than or equal to `slice.len()`");
}

// We are going to have to allocate at least `amount` for the output no matter what. However,
// if we use the `cached` version we will have to allocate `amount` as a HashMap as well since
// it inserts an element for every loop.
//
// Therefore, if `amount >= length / 2` then inplace will be both faster and use less memory.
// In fact, benchmarks show the inplace version is faster for `length` up to about 20 times
// larger than `amount`.
//
// TODO: there is probably even more fine-tuning that can be done here since
// `HashMap::with_capacity(amount)` probably allocates more than `amount` in practice,
// and a trade off could probably be made between memory/cpu, since hashmap operations
// are slower than array index swapping.
if amount >= length / 20 {
sample_indices_inplace(rng, length, amount)
} else {
sample_indices_cache(rng, length, amount)
}
}

/// Pick `amount` distinct indices out of `0..length` with an in-place partial
/// Fisher-Yates shuffle.
///
/// A vector holding every index in `0..length` is built first. Each of the
/// first `amount` slots is then swapped with a slot selected by
/// `rng.gen_range(slot, length)`, and finally the vector is cut down to those
/// first `amount` entries, which are returned.
///
/// Because the whole index range is materialised, the allocation is
/// proportional to `length` rather than to `amount`; in exchange, no auxiliary
/// lookup structure is needed.
///
/// `amount` must not exceed `length`; this is only asserted in debug builds.
fn sample_indices_inplace<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
    where R: Rng + ?Sized,
{
    debug_assert!(amount <= length);

    // Start from the identity arrangement 0, 1, ..., length - 1.
    let mut pool: Vec<usize> = (0..length).collect();

    // Settle one slot at a time: `slot` trades places with a position drawn
    // from the not-yet-settled tail starting at `slot`.
    for slot in 0..amount {
        let pick: usize = rng.gen_range(slot, length);
        pool.swap(slot, pick);
    }

    // Only the settled prefix forms the sample.
    pool.truncate(amount);
    debug_assert_eq!(pool.len(), amount);
    pool
}


/// This method performs a partial Fisher-Yates shuffle on a range of indices using a
/// map as a cache to record potential collisions (`HashMap` when the `std` feature
/// is enabled, `BTreeMap` otherwise).
///
/// The cache avoids allocating the entire `length` of values. This is especially useful when
/// `amount <<< length`, e.g. selecting 3 non-repeating indices from `1_000_000`.
///
/// Returns the `amount` sampled indices in the order they were drawn.
fn sample_indices_cache<R>(
rng: &mut R,
length: usize,
amount: usize,
) -> Vec<usize>
where R: Rng + ?Sized,
{
// Precondition; verified in debug builds only.
debug_assert!(amount <= length);
// Sparse view of a virtual array `slice` where `slice[k] == k` initially: only slots
// that have been overwritten get an entry, and an absent key `k` stands for the value `k`.
#[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
#[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
let mut out = Vec::with_capacity(amount);
for i in 0..amount {
// Slot to swap with slot `i`, drawn via `gen_range(i, length)`.
let j: usize = rng.gen_range(i, length);

// equiv: let tmp = slice[i];
let tmp = match cache.get(&i) {
Some(e) => *e,
None => i,
};

// equiv: slice[i] = slice[j];
let x = match cache.get(&j) {
Some(x) => *x,
None => j,
};

// equiv: slice[j] = tmp;
cache.insert(j, tmp);

// note that in the inplace version, slice[i] is automatically "returned" value
// Here `x` is pushed to the output directly instead of being stored under key `i`;
// later iterations only look up keys greater than `i`, so that entry would never be read.
out.push(x);
// NOTE(review): the following lines (`s`, the `for j in length - amount .. length` loop
// and the trailing `s`) appear to be the replacement body introduced by this commit --
// the commit title names Floyd's combination algorithm -- shown interleaved with the
// cache-based code by the diff rendering. They are not part of the cache-based loop.
let mut s = Vec::with_capacity(amount);
for j in length - amount .. length {
// Draw a candidate via `gen_range(0, j + 1)`; if it has already been selected, take `j`
// instead. Assuming `gen_range` excludes its upper bound, every earlier entry is
// smaller than `j`, so `j` itself cannot already be in `s`.
// NOTE(review): `s.contains` is a linear scan of the vector, so this loop performs
// on the order of `amount^2` comparisons in the worst case.
// NOTE(review): the result is not shuffled. When `amount == length`, `s` holds exactly
// `0..j` at each step, so `j` is pushed whichever value is drawn and the output is
// always `0, 1, ..., length - 1` in ascending order -- confirm that callers do not
// rely on a random ordering of the returned indices.
let t = rng.gen_range(0, j + 1);
let t = if s.contains(&t) { j } else { t };
s.push( t );
}
debug_assert_eq!(out.len(), amount);
out
s
}

#[cfg(test)]
Expand Down Expand Up @@ -267,13 +188,9 @@ mod test {
let v = sample_slice(&mut r, &[42, 133], 2);
assert!(&v[..] == [42, 133] || v[..] == [133, 42]);

assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices_inplace(&mut r, 1, 1)[..], [0]);

assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
assert_eq!(&sample_indices(&mut r, 0, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices(&mut r, 1, 0)[..], [0usize; 0]);
assert_eq!(&sample_indices(&mut r, 1, 1)[..], [0]);

// Make sure lucky 777's aren't lucky
let slice = &[42, 777];
Expand Down Expand Up @@ -304,19 +221,11 @@ mod test {
let mut seed = [0u8; 16];
r.fill(&mut seed);

// assert that the two index methods give exactly the same result
let inplace = sample_indices_inplace(
&mut xor_rng(seed), length, amount);
let cache = sample_indices_cache(
&mut xor_rng(seed), length, amount);
assert_eq!(inplace, cache);

// assert the basics work
let regular = sample_indices(
&mut xor_rng(seed), length, amount);
assert_eq!(regular.len(), amount);
assert!(regular.iter().all(|e| *e < length));
assert_eq!(regular, inplace);

// also test that sampling the slice works
let vec: Vec<usize> = (0..length).collect();
Expand Down

0 comments on commit 87d5cd4

Please sign in to comment.