diff --git a/CHANGELOG.md b/CHANGELOG.md
index 716f6639a20..856d3ac35e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,11 @@ A [separate changelog is kept for rand_core](rand_core/CHANGELOG.md).
 
 You may also find the [Update Guide](UPDATING.md) useful.
 
+## [0.6.0] - Unreleased
+
+### Sequences module
+- Optimised `sample_indices`, now `seq::index::sample`, which returns an `IndexVec`. (#479)
+
 ## [0.5.4] - 2018-07-11
 ### Platform support
 - Make `OsRng` work via WASM/stdweb for WebWorkers
diff --git a/benches/seq.rs b/benches/seq.rs
index 260e2334a41..f143131763b 100644
--- a/benches/seq.rs
+++ b/benches/seq.rs
@@ -1,4 +1,5 @@
 #![feature(test)]
+#![allow(non_snake_case)]
 
 extern crate test;
 extern crate rand;
@@ -27,28 +28,31 @@ fn seq_slice_choose_1_of_1000(b: &mut Bencher) {
     })
 }
 
-#[bench]
-fn seq_slice_choose_multiple_1_of_1000(b: &mut Bencher) {
-    let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
-    let x : &[usize] = &[1; 1000];
-    b.iter(|| {
-        x.choose_multiple(&mut rng, 1).cloned().next()
-    })
-}
-
-#[bench]
-fn seq_slice_choose_multiple_10_of_100(b: &mut Bencher) {
-    let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
-    let x : &[usize] = &[1; 100];
-    let mut buf = [0; 10];
-    b.iter(|| {
-        for (v, slot) in x.choose_multiple(&mut rng, buf.len()).zip(buf.iter_mut()) {
-            *slot = *v;
+macro_rules! seq_slice_choose_multiple {
+    ($name:ident, $amount:expr, $length:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
+            let x : &[i32] = &[$amount; $length];
+            let mut result = [0i32; $amount];
+            b.iter(|| {
+                // Collect full result to prevent unwanted shortcuts getting
+                // first element (in case `index::sample` returns an iterator).
+                for (slot, sample) in result.iter_mut().zip(
+                    x.choose_multiple(&mut rng, $amount)) {
+                    *slot = *sample;
+                }
+                result[$amount-1]
+            })
         }
-        buf
-    })
+    }
 }
 
+seq_slice_choose_multiple!(seq_slice_choose_multiple_1_of_1000, 1, 1000);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_950_of_1000, 950, 1000);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_10_of_100, 10, 100);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_90_of_100, 90, 100);
+
 #[bench]
 fn seq_iter_choose_from_100(b: &mut Bencher) {
     let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
@@ -78,17 +82,22 @@ fn seq_iter_choose_multiple_fill_10_of_100(b: &mut Bencher) {
 }
 
 macro_rules! sample_indices {
-    ($name:ident, $amount:expr, $length:expr) => {
+    ($name:ident, $fn:ident, $amount:expr, $length:expr) => {
         #[bench]
         fn $name(b: &mut Bencher) {
             let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
             b.iter(|| {
-                sample_indices(&mut rng, $length, $amount)
+                index::$fn(&mut rng, $length, $amount)
             })
         }
     }
 }
 
-sample_indices!(seq_sample_indices_10_of_1k, 10, 1000);
-sample_indices!(seq_sample_indices_50_of_1k, 50, 1000);
-sample_indices!(seq_sample_indices_100_of_1k, 100, 1000);
+sample_indices!(misc_sample_indices_1_of_1k, sample, 1, 1000);
+sample_indices!(misc_sample_indices_10_of_1k, sample, 10, 1000);
+sample_indices!(misc_sample_indices_100_of_1k, sample, 100, 1000);
+sample_indices!(misc_sample_indices_100_of_1M, sample, 100, 1000_000);
+sample_indices!(misc_sample_indices_100_of_1G, sample, 100, 1000_000_000);
+sample_indices!(misc_sample_indices_200_of_1G, sample, 200, 1000_000_000);
+sample_indices!(misc_sample_indices_400_of_1G, sample, 400, 1000_000_000);
+sample_indices!(misc_sample_indices_600_of_1G, sample, 600, 1000_000_000);
diff --git a/src/lib.rs b/src/lib.rs
index f07a68c495a..197fc2546fc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -234,7 +234,7 @@
 #![cfg_attr(feature = "wasm-bindgen", feature(wasm_import_module))]
 
 #[cfg(feature = "std")] extern crate core;
-#[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc;
+#[cfg(all(feature = "alloc", not(feature="std")))] #[macro_use] extern crate alloc;
 
 #[cfg(feature="simd_support")] extern crate packed_simd;
 
diff --git a/src/seq/index.rs b/src/seq/index.rs
new file mode 100644
index 00000000000..805b7f3c2f3
--- /dev/null
+++ b/src/seq/index.rs
@@ -0,0 +1,398 @@
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// https://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Index sampling
+
+#[cfg(feature="alloc")] use core::slice;
+
+#[cfg(feature="std")] use std::vec;
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::{self, Vec};
+// BTreeSet is not as fast in tests, but better than nothing.
+#[cfg(feature="std")] use std::collections::{HashSet};
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
+
+#[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
+use Rng;
+
+/// A vector of indices.
+/// 
+/// Multiple internal representations are possible.
+#[derive(Clone, Debug)]
+pub enum IndexVec {
+    #[doc(hidden)] U32(Vec<u32>),
+    #[doc(hidden)] USize(Vec<usize>),
+}
+
+impl IndexVec {
+    /// Returns the number of indices
+    pub fn len(&self) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v.len(),
+            &IndexVec::USize(ref v) => v.len(),
+        }
+    }
+    
+    /// Return the value at the given `index`.
+    /// 
+    /// (Note: we cannot implement `std::ops::Index` because of lifetime
+    /// restrictions.)
+    pub fn index(&self, index: usize) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v[index] as usize,
+            &IndexVec::USize(ref v) => v[index],
+        }
+    }
+
+    /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
+    pub fn into_vec(self) -> Vec<usize> {
+        match self {
+            IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+            IndexVec::USize(v) => v,
+        }
+    }
+
+    /// Iterate over the indices as a sequence of `usize` values
+    pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
+        match self {
+            &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
+            &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
+        }
+    }
+    
+    /// Convert into an iterator over the indices as a sequence of `usize` values
+    pub fn into_iter(self) -> IndexVecIntoIter {
+        match self {
+            IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
+            IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
+        }
+    }
+}
+
+impl PartialEq for IndexVec {
+    fn eq(&self, other: &IndexVec) -> bool {
+        use self::IndexVec::*;
+        match (self, other) {
+            (&U32(ref v1), &U32(ref v2)) => v1 == v2,
+            (&USize(ref v1), &USize(ref v2)) => v1 == v2,
+            (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
+            (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
+        }
+    }
+}
+
+impl From<Vec<u32>> for IndexVec {
+    fn from(v: Vec<u32>) -> Self {
+        IndexVec::U32(v)
+    }
+}
+
+impl From<Vec<usize>> for IndexVec {
+    fn from(v: Vec<usize>) -> Self {
+        IndexVec::USize(v)
+    }
+}
+
+/// Return type of `IndexVec::iter`.
+#[derive(Debug)]
+pub enum IndexVecIter<'a> {
+    #[doc(hidden)] U32(slice::Iter<'a, u32>),
+    #[doc(hidden)] USize(slice::Iter<'a, usize>),
+}
+
+impl<'a> Iterator for IndexVecIter<'a> {
+    type Item = usize;
+    fn next(&mut self) -> Option<usize> {
+        use self::IndexVecIter::*;
+        match self {
+            &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
+            &mut USize(ref mut iter) => iter.next().cloned(),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            &IndexVecIter::U32(ref v) => v.size_hint(),
+            &IndexVecIter::USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
+
+/// Return type of `IndexVec::into_iter`.
+#[derive(Clone, Debug)]
+pub enum IndexVecIntoIter {
+    #[doc(hidden)] U32(vec::IntoIter<u32>),
+    #[doc(hidden)] USize(vec::IntoIter<usize>),
+}
+
+impl Iterator for IndexVecIntoIter {
+    type Item = usize;
+    
+    fn next(&mut self) -> Option<Self::Item> {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &mut U32(ref mut v) => v.next().map(|i| i as usize),
+            &mut USize(ref mut v) => v.next(),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &U32(ref v) => v.size_hint(),
+            &USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl ExactSizeIterator for IndexVecIntoIter {}
+
+
+/// Randomly sample exactly `amount` distinct indices from `0..length`, and
+/// return them in random order (fully shuffled).
+///
+/// This method is used internally by the slice sampling methods, but it can
+/// sometimes be useful to have the indices themselves so this is provided as
+/// an alternative.
+///
+/// The implementation used is not specified; we automatically select the
+/// fastest available algorithm for the `length` and `amount` parameters
+/// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
+/// complexity is `O(amount)`, except that when `amount` is small, performance
+/// is closer to `O(amount^2)`, and when `length` is close to `amount` then
+/// `O(length)`.
+///
+/// Note that performance is significantly better over `u32` indices than over
+/// `u64` indices. Because of this we hide the underlying type behind an
+/// abstraction, `IndexVec`.
+/// 
+/// If an allocation-free `no_std` function is required, it is suggested
+/// to adapt the internal `sample_floyd` implementation.
+///
+/// Panics if `amount > length`.
+pub fn sample<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    if amount > length {
+        panic!("`amount` of samples must be less than or equal to `length`");
+    }
+    if length > (::core::u32::MAX as usize) {
+        // We never want to use inplace here, but could use floyd's alg
+        // Lazy version: always use the rejection alg.
+        return sample_rejection(rng, length, amount);
+    }
+    let amount = amount as u32;
+    let length = length as u32;
+    
+    // Choice of algorithm here depends on both length and amount. See:
+    // https://github.com/rust-lang-nursery/rand/pull/479
+    // We do some calculations with f32. Accuracy is not very important.
+
+    if amount < 163 {
+        const C: [[f32; 2]; 2] = [[1.6, 8.0/45.0], [10.0, 70.0/9.0]];
+        let j = if length < 500_000 { 0 } else { 1 };
+        let amount_fp = amount as f32;
+        let m4 = C[0][j] * amount_fp;
+        // Short-cut: when amount < 12, floyd's is always faster
+        if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
+            sample_inplace(rng, length, amount)
+        } else {
+            sample_floyd(rng, length, amount)
+        }
+    } else {
+        const C: [f32; 2] = [270.0, 330.0/9.0];
+        let j = if length < 500_000 { 0 } else { 1 };
+        if (length as f32) < C[j] * (amount as f32) {
+            sample_inplace(rng, length, amount)
+        } else {
+            // note: could have a specific u32 impl, but I'm lazy and
+            // generics don't have usable conversions
+            sample_rejection(rng, length as usize, amount as usize)
+        }
+    }
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
+/// combination algorithm.
+/// 
+/// The output values are fully shuffled. (Overhead is under 50%.)
+///
+/// This implementation uses `O(amount)` memory and `O(amount^2)` time.
+fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    // Same result as `slice.iter().position(|x| *x == elt)`.
+    fn find_pos<T: Copy + PartialEq<T>>(slice: &[T], elt: T) -> Option<usize> {
+        for i in 0..slice.len() {
+            if slice[i] == elt {
+                return Some(i);
+            }
+        }
+        None
+    }
+    
+    // For small amount we use Floyd's fully-shuffled variant. For larger
+    // amounts this is slow due to Vec::insert performance, so we shuffle
+    // afterwards. Benchmarks show little overhead from extra logic.
+    let floyd_shuffle = amount < 50;
+    
+    debug_assert!(amount <= length);
+    let mut indices = Vec::with_capacity(amount as usize);
+    for j in length - amount .. length {
+        let t = rng.gen_range(0, j + 1);
+        if floyd_shuffle {
+            if let Some(pos) = find_pos(&indices, t) {
+                indices.insert(pos, j);
+                continue;
+            }
+        } else {
+            if indices.contains(&t) {
+                indices.push(j);
+                continue;
+            }
+        }
+        indices.push(t);
+    }
+    if !floyd_shuffle {
+        // Reimplement SliceRandom::shuffle with smaller indices
+        for i in (1..amount).rev() {
+            // invariant: elements with index > i have been locked in place.
+            indices.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+        }
+    }
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using an inplace
+/// partial Fisher-Yates method.
+/// The returned indices are distinct and appear in random order.
+///
+/// This allocates the entire `length` of indices and randomizes only the first `amount`.
+/// It then truncates to `amount` and returns.
+/// 
+/// This method is not appropriate for large `length` and potentially uses a lot
+/// of memory; because of this we only implement for `u32` index (which improves
+/// performance in all cases).
+///
+/// Set-up is `O(length)` time and memory and shuffling is `O(amount)` time.
+fn sample_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount <= length);
+    let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
+    indices.extend(0..length);
+    for i in 0..amount {
+        let j: u32 = rng.gen_range(i, length);
+        indices.swap(i as usize, j as usize);
+    }
+    indices.truncate(amount as usize);
+    debug_assert_eq!(indices.len(), amount as usize);
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using rejection
+/// sampling.
+/// 
+/// Since `amount <<< length` there is a low chance of a random sample in
+/// `0..length` being a duplicate. We test for duplicates and resample where
+/// necessary. The algorithm is `O(amount)` time and memory.
+fn sample_rejection<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount < length);
+    #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
+    #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
+    let distr = Uniform::new(0, length);
+    let mut indices = Vec::with_capacity(amount);
+    for _ in 0..amount {
+        let mut pos = distr.sample(rng);
+        while !cache.insert(pos) {
+            pos = distr.sample(rng);
+        }
+        indices.push(pos);
+    }
+    
+    debug_assert_eq!(indices.len(), amount);
+    IndexVec::from(indices)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use {Rng, SeedableRng};
+    use prng::XorShiftRng;
+    
+    #[test]
+    fn test_sample_boundaries() {
+        let mut r = ::test::rng(404);
+        
+        assert_eq!(sample_inplace(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 1).into_vec(), vec![0]);
+
+        assert_eq!(sample_rejection(&mut r, 1, 0).len(), 0);
+
+        assert_eq!(sample_floyd(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 1).into_vec(), vec![0]);
+        
+        // These algorithms should be fast with big numbers. Test average.
+        let sum: usize = sample_rejection(&mut r, 1 << 25, 10)
+                .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+        
+        let sum: usize = sample_floyd(&mut r, 1 << 25, 10)
+                .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+    }
+    
+    #[test]
+    fn test_sample_alg() {
+        let xor_rng = XorShiftRng::from_seed;
+
+        let mut r = ::test::rng(403);
+        let mut seed = [0u8; 16];
+        
+        // We can't test which algorithm is used directly, but Floyd's alg
+        // should produce different results from the others. (Also, `inplace`
+        // and `rejection` currently use different sizes thus produce different results.)
+        
+        // A small length and relatively large amount should use inplace
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (100, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_inplace(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+        
+        // Test Floyd's alg does produce different results
+        let v3 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1 != v3);
+        
+        // A large length and small amount should use Floyd
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+        
+        // A large length and larger amount should use rejection
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 600);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_rejection(&mut xor_rng(seed), length, amount);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+    }
+}
diff --git a/src/seq.rs b/src/seq/mod.rs
similarity index 79%
rename from src/seq.rs
rename to src/seq/mod.rs
index e030712b3d1..4e06bac2863 100644
--- a/src/seq.rs
+++ b/src/seq/mod.rs
@@ -12,18 +12,15 @@
 //! 
 //! TODO: module doc
 
+
+#[cfg(feature="alloc")] pub mod index;
+
 #[cfg(feature="alloc")] use core::ops::Index;
 
-#[cfg(feature="std")] use std::vec;
-#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec;
 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::Vec;
-// BTreeMap is not as fast in tests, but better than nothing.
-#[cfg(feature="std")] use std::collections::HashMap;
-#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeMap;
 
-#[cfg(feature = "alloc")] use distributions::WeightedError;
-
-use super::Rng;
+use Rng;
+#[cfg(feature="alloc")] use distributions::WeightedError;
 #[cfg(feature="alloc")] use distributions::uniform::{SampleUniform, SampleBorrow};
 
 /// Extension trait on slices, providing random mutation and sampling methods.
@@ -61,18 +58,12 @@ pub trait SliceRandom {
         where R: Rng + ?Sized;
 
     /// Produces an iterator that chooses `amount` elements from the slice at
-    /// random without repeating any.
-    ///
-    /// In case this API is not sufficiently flexible, use `sample_indices` then
-    /// apply the indices to the slice.
+    /// random without repeating any, and returns them in random order.
     /// 
-    /// Although the elements are selected randomly, the order of returned
-    /// elements is neither stable nor fully random. If random ordering is
-    /// desired, either use `partial_shuffle` or use this method and shuffle
-    /// the result. If stable order is desired, use `sample_indices`, sort the
-    /// result, then apply to the slice.
+    /// In case this API is not sufficiently flexible, use `index::sample` then
+    /// apply the indices to the slice.
     /// 
-    /// Complexity is expected to be the same as `sample_indices`.
+    /// Complexity is expected to be the same as `index::sample`.
     /// 
     /// # Example
     /// ```
@@ -317,14 +308,15 @@ impl<T> SliceRandom for [T] {
     }
 
     #[cfg(feature = "alloc")]
-    fn choose_multiple<R>(&self, rng: &mut R, amount: usize) -> SliceChooseIter<Self, Self::Item>
+    fn choose_multiple<R>(&self, rng: &mut R, amount: usize)
+        -> SliceChooseIter<Self, Self::Item>
         where R: Rng + ?Sized
     {
         let amount = ::core::cmp::min(amount, self.len());
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: sample_indices(rng, self.len(), amount).into_iter(),
+            indices: index::sample(rng, self.len(), amount).into_iter(),
         }
     }
 
@@ -396,7 +388,7 @@ impl<I> IteratorRandom for I where I: Iterator + Sized {}
 pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> {
     slice: &'a S,
     _phantom: ::core::marker::PhantomData<T>,
-    indices: vec::IntoIter<usize>,
+    indices: index::IndexVecIntoIter,
 }
 
 #[cfg(feature = "alloc")]
@@ -405,7 +397,7 @@ impl<'a, S: Index<usize, Output = T> + ?Sized + 'a, T: 'a> Iterator for SliceCho
 
     fn next(&mut self) -> Option<Self::Item> {
         // TODO: investigate using SliceIndex::get_unchecked when stable
-        self.indices.next().map(|i| &(*self.slice)[i])
+        self.indices.next().map(|i| &self.slice[i as usize])
     }
     
     fn size_hint(&self) -> (usize, Option<usize>) {
@@ -461,10 +453,10 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     where R: Rng + ?Sized,
           T: Clone
 {
-    let indices = sample_indices(rng, slice.len(), amount);
+    let indices = index::sample(rng, slice.len(), amount).into_iter();
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| slice[*i].clone()));
+    out.extend(indices.map(|i| slice[i].clone()));
     out
 }
 
@@ -484,113 +476,10 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
 pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T>
     where R: Rng + ?Sized
 {
-    let indices = sample_indices(rng, slice.len(), amount);
-
-    let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| &slice[*i]));
-    out
-}
-
-/// Randomly sample exactly `amount` indices from `0..length`.
-///
-/// The values are non-repeating and in random order.
-///
-/// This implementation uses `O(amount)` time and memory.
-///
-/// This method is used internally by the slice sampling methods, but it can sometimes be useful to
-/// have the indices themselves so this is provided as an alternative.
-///
-/// Panics if `amount > length`
-#[cfg(feature = "alloc")]
-pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
-    where R: Rng + ?Sized,
-{
-    if amount > length {
-        panic!("`amount` must be less than or equal to `slice.len()`");
-    }
-
-    // We are going to have to allocate at least `amount` for the output no matter what. However,
-    // if we use the `cached` version we will have to allocate `amount` as a HashMap as well since
-    // it inserts an element for every loop.
-    //
-    // Therefore, if `amount >= length / 2` then inplace will be both faster and use less memory.
-    // In fact, benchmarks show the inplace version is faster for length up to about 20 times
-    // faster than amount.
-    //
-    // TODO: there is probably even more fine-tuning that can be done here since
-    // `HashMap::with_capacity(amount)` probably allocates more than `amount` in practice,
-    // and a trade off could probably be made between memory/cpu, since hashmap operations
-    // are slower than array index swapping.
-    if amount >= length / 20 {
-        sample_indices_inplace(rng, length, amount)
-    } else {
-        sample_indices_cache(rng, length, amount)
-    }
-}
-
-/// Sample an amount of indices using an inplace partial fisher yates method.
-///
-/// This allocates the entire `length` of indices and randomizes only the first `amount`.
-/// It then truncates to `amount` and returns.
-///
-/// This is better than using a `HashMap` "cache" when `amount >= length / 2`
-/// since it does not require allocating an extra cache and is much faster.
-#[cfg(feature = "alloc")]
-fn sample_indices_inplace<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
-    where R: Rng + ?Sized,
-{
-    debug_assert!(amount <= length);
-    let mut indices: Vec<usize> = Vec::with_capacity(length);
-    indices.extend(0..length);
-    for i in 0..amount {
-        let j: usize = rng.gen_range(i, length);
-        indices.swap(i, j);
-    }
-    indices.truncate(amount);
-    debug_assert_eq!(indices.len(), amount);
-    indices
-}
+    let indices = index::sample(rng, slice.len(), amount).into_iter();
 
-
-/// This method performs a partial fisher-yates on a range of indices using a
-/// `HashMap` as a cache to record potential collisions.
-///
-/// The cache avoids allocating the entire `length` of values. This is especially useful when
-/// `amount <<< length`, i.e. select 3 non-repeating from `1_000_000`
-#[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(
-    rng: &mut R,
-    length: usize,
-    amount: usize,
-) -> Vec<usize>
-    where R: Rng + ?Sized,
-{
-    debug_assert!(amount <= length);
-    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
-    #[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
     let mut out = Vec::with_capacity(amount);
-    for i in 0..amount {
-        let j: usize = rng.gen_range(i, length);
-
-        // equiv: let tmp = slice[i];
-        let tmp = match cache.get(&i) {
-            Some(e) => *e,
-            None => i,
-        };
-
-        // equiv: slice[i] = slice[j];
-        let x = match cache.get(&j) {
-            Some(x) => *x,
-            None => j,
-        };
-
-        // equiv: slice[j] = tmp;
-        cache.insert(j, tmp);
-
-        // note that in the inplace version, slice[i] is automatically "returned" value
-        out.push(x);
-    }
-    debug_assert_eq!(out.len(), amount);
+    out.extend(indices.map(|i| &slice[i]));
     out
 }
 
@@ -648,7 +537,6 @@ mod test {
 
     #[test]
     fn test_shuffle() {
-
         let mut r = ::test::rng(108);
         let empty: &mut [isize] = &mut [];
         empty.shuffle(&mut r);
@@ -752,14 +640,6 @@ mod test {
         let v = sample_slice(&mut r, &[42, 133], 2);
         assert!(&v[..] == [42, 133] || v[..] == [133, 42]);
 
-        assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_inplace(&mut r, 1, 1)[..], [0]);
-
-        assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
-
         // Make sure lucky 777's aren't lucky
         let slice = &[42, 777];
         let mut num_42 = 0;
@@ -783,43 +663,29 @@ mod test {
     fn test_sample_slice() {
         let xor_rng = XorShiftRng::from_seed;
 
-        let max_range = 100;
         let mut r = ::test::rng(403);
 
-        for length in 1usize..max_range {
+        for n in 1..20 {
+            let length = 5*n - 4;   // 1, 6, ...
             let amount = r.gen_range(0, length);
             let mut seed = [0u8; 16];
             r.fill(&mut seed);
 
-            // assert that the two index methods give exactly the same result
-            let inplace = sample_indices_inplace(
-                &mut xor_rng(seed), length, amount);
-            let cache = sample_indices_cache(
-                &mut xor_rng(seed), length, amount);
-            assert_eq!(inplace, cache);
-
             // assert the basics work
-            let regular = sample_indices(
-                &mut xor_rng(seed), length, amount);
+            let regular = index::sample(&mut xor_rng(seed), length, amount);
             assert_eq!(regular.len(), amount);
-            assert!(regular.iter().all(|e| *e < length));
-            assert_eq!(regular, inplace);
+            assert!(regular.iter().all(|e| e < length));
 
             // also test that sampling the slice works
-            let vec: Vec<usize> = (0..length).collect();
-            {
-                let result = sample_slice(&mut xor_rng(seed), &vec, amount);
-                assert_eq!(result, regular);
-            }
+            let vec: Vec<u32> = (0..(length as u32)).collect();
+            let result = sample_slice(&mut xor_rng(seed), &vec, amount);
+            assert_eq!(result, regular.iter().map(|i| i as u32).collect::<Vec<_>>());
 
-            {
-                let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
-                let expected = regular.iter().map(|v| v).collect::<Vec<_>>();
-                assert_eq!(result, expected);
-            }
+            let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
+            assert!(result.iter().zip(regular.iter()).all(|(i,j)| **i == j as u32));
         }
     }
-
+    
     #[test]
     #[cfg(feature = "alloc")]
     fn test_weighted() {