From 46a739f5feb8786d6facad3dad7fda0d39260fb3 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:11:59 +0200 Subject: [PATCH 01/13] Add Itertool::k_smallest method, in std context --- src/k_smallest.rs | 16 ++++++++++++++++ src/lib.rs | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 src/k_smallest.rs diff --git a/src/k_smallest.rs b/src/k_smallest.rs new file mode 100644 index 000000000..36ff70b2b --- /dev/null +++ b/src/k_smallest.rs @@ -0,0 +1,16 @@ +use std::collections::BinaryHeap; +use std::cmp::Ord; + +pub(crate) fn k_smallest>(mut iter: I, k: usize) -> BinaryHeap { + if k == 0 { return BinaryHeap::new(); } + + let mut heap = iter.by_ref().take(k).collect::>(); + + for i in iter { + // Guaranteed not-None, since we keep exactly k>0 elements in the heap. + let mut lorgest = heap.peek_mut().unwrap(); + if *lorgest > i { *lorgest = i; } + } + + heap +} diff --git a/src/lib.rs b/src/lib.rs index 9ce015209..b57b012fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -194,6 +194,8 @@ mod group_map; mod groupbylazy; mod intersperse; #[cfg(feature = "use_alloc")] +mod k_smallest; +#[cfg(feature = "use_alloc")] mod kmerge_impl; #[cfg(feature = "use_alloc")] mod lazy_buffer; @@ -2284,6 +2286,39 @@ pub trait Itertools : Iterator { v.into_iter() } + /// Sort the k smallest elements into a new iterator, in ascending order. + /// + /// **Note:** This consumes the entire iterator, and returns the result + /// as a new iterator that owns its elements. + /// + /// This is guaranteed to use `k * sizeof(Self::Item) + O(1)` memory + /// and `O(n log k)` time, with `n` the number of elements in the input. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_smallest = numbers + /// .into_iter() + /// .k_smallest(5); + /// + /// itertools::assert_equal(five_smallest, 0..5); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_smallest(self, k: usize) -> VecIntoIter + where Self: Sized, + Self::Item: Ord + { + crate::k_smallest::k_smallest(self, k) + .into_sorted_vec() + .into_iter() + } + /// Collect all iterator elements into one of two /// partitions. Unlike `Iterator::partition`, each partition may /// have a distinct type. From 51670c9108693f9b3b88bd54a1c4d7a5a925379b Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:08 +0200 Subject: [PATCH 02/13] k_smollest: Add debug_assert for invariant --- src/k_smallest.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/k_smallest.rs b/src/k_smallest.rs index 36ff70b2b..b5160fe90 100644 --- a/src/k_smallest.rs +++ b/src/k_smallest.rs @@ -7,6 +7,7 @@ pub(crate) fn k_smallest>(mut iter: I, k: usize) - let mut heap = iter.by_ref().take(k).collect::>(); for i in iter { + debug_assert_eq!(heap.len(), k); // Guaranteed not-None, since we keep exactly k>0 elements in the heap. let mut lorgest = heap.peek_mut().unwrap(); if *lorgest > i { *lorgest = i; } From e71977e031457da443ffc4963f0d4d34ec27277d Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:09 +0200 Subject: [PATCH 03/13] Add property-based test for k_smallest when applied to ranges --- tests/test_std.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_std.rs b/tests/test_std.rs index 24d0f0888..a5cf9debf 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -1,4 +1,6 @@ use permutohedron; +use quickcheck::quickcheck; +use std::cmp::min; use itertools as it; use crate::it::Itertools; use crate::it::ExactlyOneError; @@ -354,6 +356,18 @@ fn sorted_by() { it::assert_equal(v, vec![4, 3, 2, 1, 0]); } +quickcheck! { + fn k_smallest_range(n: u64, m: u64, k: u64) -> () { + // Check that taking the k smallest elements in n..n+m + // yields n..n+min(k, m) + let i = (n..n.saturating_add(m)).into_iter(); + it::assert_equal( + i.k_smallest(k as usize), + n..n.saturating_add(min(k, m)) + ); + } +} + #[test] fn sorted_by_key() { let sc = [3, 4, 1, 2].iter().cloned().sorted_by_key(|&x| x); From 11b54cc8d284ca6f384553d19779c2c62458397a Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:11 +0200 Subject: [PATCH 04/13] tests::k_smallest_range: Shuffle the input before taking the k smallest --- tests/test_std.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_std.rs b/tests/test_std.rs index a5cf9debf..81b451425 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -1,5 +1,6 @@ use permutohedron; use quickcheck::quickcheck; +use rand::{seq::SliceRandom, thread_rng}; use std::cmp::min; use itertools as it; use crate::it::Itertools; @@ -358,9 +359,14 @@ fn sorted_by() { quickcheck! { fn k_smallest_range(n: u64, m: u64, k: u64) -> () { - // Check that taking the k smallest elements in n..n+m - // yields n..n+min(k, m) - let i = (n..n.saturating_add(m)).into_iter(); + // Generate a random permutation of n..n+m + let i = { + let mut v: Vec = (n..n.saturating_add(m)).collect(); + v.shuffle(&mut thread_rng()); + v.into_iter() + }; + + // Check that taking the k smallest elements yields n..n+min(k, m) it::assert_equal( i.k_smallest(k as usize), n..n.saturating_add(min(k, m)) From b0443e0d8a2352ef51e10bcd0c633d2794ec23ce Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:12 +0200 Subject: [PATCH 05/13] k_smallest: Clarify what happens when self.len() < k --- src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index b57b012fc..c3b3a9418 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2289,7 +2289,8 @@ pub trait Itertools : Iterator { /// Sort the k smallest elements into a new iterator, in ascending order. /// /// **Note:** This consumes the entire iterator, and returns the result - /// as a new iterator that owns its elements. + /// as a new iterator that owns its elements. If the input contains + /// less than k elements, the result is equivalent to `self.sorted()`. /// /// This is guaranteed to use `k * sizeof(Self::Item) + O(1)` memory /// and `O(n log k)` time, with `n` the number of elements in the input. From d292fa0488d7cb43c49fdd605e5d70771b0ef597 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:14 +0200 Subject: [PATCH 06/13] k_smallest: Document the equivalence with self.sorted().take(k) Consistency with `Iterator::take` is the rationale for returning less than `k` elements when the input is too short. --- src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index c3b3a9418..9d2f8b9f1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2298,6 +2298,9 @@ pub trait Itertools : Iterator { /// The sorted iterator, if directly collected to a `Vec`, is converted /// without any extra copying or allocation cost. /// + /// **Note:** This is functionally-equivalent to `self.sorted().take(k)` + /// but much more efficient. + /// /// ``` /// use itertools::Itertools; /// From 7aadb1ea1425525a94675dd32b0f32fe88567c22 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:15 +0200 Subject: [PATCH 07/13] tests::test_std: Support generating random iterators with quickcheck --- tests/test_std.rs | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/tests/test_std.rs b/tests/test_std.rs index 81b451425..d7e1c2ad4 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -1,7 +1,8 @@ use permutohedron; -use quickcheck::quickcheck; +use quickcheck as qc; +use rand::{distributions::{Distribution, Standard}, Rng, SeedableRng, rngs::StdRng}; use rand::{seq::SliceRandom, thread_rng}; -use std::cmp::min; +use std::{cmp::min, fmt::Debug, marker::PhantomData}; use itertools as it; use crate::it::Itertools; use crate::it::ExactlyOneError; @@ -357,7 +358,7 @@ fn sorted_by() { it::assert_equal(v, vec![4, 3, 2, 1, 0]); } -quickcheck! { +qc::quickcheck! { fn k_smallest_range(n: u64, m: u64, k: u64) -> () { // Generate a random permutation of n..n+m let i = { @@ -374,6 +375,38 @@ quickcheck! { } } +#[derive(Clone, Debug)] +struct RandIter { + idx: usize, + len: usize, + rng: R, + _t: PhantomData +} + +impl Iterator for RandIter +where Standard: Distribution { + type Item = T; + fn next(&mut self) -> Option { + if self.idx == self.len { + None + } else { + self.idx += 1; + Some(self.rng.gen()) + } + } +} + +impl qc::Arbitrary for RandIter { + fn arbitrary(g: &mut G) -> Self { + RandIter { + idx: 0, + len: g.size(), + rng: R::seed_from_u64(g.next_u64()), + _t : PhantomData{}, + } + } +} + #[test] fn sorted_by_key() { let sc = [3, 4, 1, 2].iter().cloned().sorted_by_key(|&x| x); From 036a4519113cd752b200b78eba6a2edeecc001da Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:16 +0200 Subject: [PATCH 08/13] Add generic test comparing k_smallest(k) and sorted().take(k) Instantiate the test over all unsigned integer types --- tests/test_std.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_std.rs b/tests/test_std.rs index d7e1c2ad4..8fce9617f 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -407,6 +407,35 @@ impl qc::Arbitrary for Ran } } +// Check that taking the k smallest is the same as +// sorting then taking the k first elements +fn k_smallest_sort(i: I, k: usize) -> () +where + I: Iterator + Clone, + I::Item: Ord + Debug, +{ + let j = i.clone(); + it::assert_equal( + i.k_smallest(k), + j.sorted().take(k) + ) +} + +qc::quickcheck! { + fn k_smallest_sort_u8(i: RandIter, k: u16) -> () { + k_smallest_sort(i, k) + } + fn k_smallest_sort_u16(i: RandIter, k: u16) -> () { + k_smallest_sort(i, k) + } + fn k_smallest_sort_u32(i: RandIter, k: u16) -> () { + k_smallest_sort(i, k) + } + fn k_smallest_sort_u64(i: RandIter, k: u16) -> () { + k_smallest_sort(i, k) + } +} + #[test] fn sorted_by_key() { let sc = [3, 4, 1, 2].iter().cloned().sorted_by_key(|&x| x); From 900f1e8cef4082ed22df2ad61cc46ccc3ada7783 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:12:18 +0200 Subject: [PATCH 09/13] k_smallest: Comment what we do with peek_mut() --- src/k_smallest.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/k_smallest.rs b/src/k_smallest.rs index b5160fe90..91afb4962 100644 --- a/src/k_smallest.rs +++ b/src/k_smallest.rs @@ -10,6 +10,7 @@ pub(crate) fn k_smallest>(mut iter: I, k: usize) - debug_assert_eq!(heap.len(), k); // Guaranteed not-None, since we keep exactly k>0 elements in the heap. let mut lorgest = heap.peek_mut().unwrap(); + // Equivalent to heap.push(min(i, heap.pop())) but more efficient. if *lorgest > i { *lorgest = i; } } From f31b61739bf275491597def9c86747ce504fcfc4 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:15:17 +0200 Subject: [PATCH 10/13] test_std::k_smallest: Avoid allocating too much memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By limiting k and m to be u16, we use at most 2¹⁶ sizeof(u64) = 512 kiB for each allocated array and heap (at most 1MiB total for k_smallest_range) --- tests/test_std.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_std.rs b/tests/test_std.rs index 8fce9617f..c4f57c0e5 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -359,7 +359,11 @@ fn sorted_by() { } qc::quickcheck! { - fn k_smallest_range(n: u64, m: u64, k: u64) -> () { + fn k_smallest_range(n: u64, m: u16, k: u16) -> () { + // u16 is used to constrain k and m to 0..2¹⁶, + // otherwise the test could use too much memory. + let (k, m) = (k as u64, m as u64); + // Generate a random permutation of n..n+m let i = { let mut v: Vec = (n..n.saturating_add(m)).collect(); @@ -409,12 +413,13 @@ impl qc::Arbitrary for Ran // Check that taking the k smallest is the same as // sorting then taking the k first elements -fn k_smallest_sort(i: I, k: usize) -> () +fn k_smallest_sort(i: I, k: u16) -> () where I: Iterator + Clone, I::Item: Ord + Debug, { let j = i.clone(); + let k = k as usize; it::assert_equal( i.k_smallest(k), j.sorted().take(k) From 45e5a6494eebd880ff7114c9e7bff7e6a0fd8b36 Mon Sep 17 00:00:00 2001 From: Giacomo Stevanato Date: Fri, 2 Oct 2020 12:15:19 +0200 Subject: [PATCH 11/13] k_smallest: Work around suboptimality in `peek_mut` See https://github.com/rust-lang/rust/pull/75974 Co-authored-by: nicoo --- src/k_smallest.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/k_smallest.rs b/src/k_smallest.rs index 91afb4962..0995b6c1c 100644 --- a/src/k_smallest.rs +++ b/src/k_smallest.rs @@ -8,10 +8,12 @@ pub(crate) fn k_smallest>(mut iter: I, k: usize) - for i in iter { debug_assert_eq!(heap.len(), k); - // Guaranteed not-None, since we keep exactly k>0 elements in the heap. - let mut lorgest = heap.peek_mut().unwrap(); // Equivalent to heap.push(min(i, heap.pop())) but more efficient. - if *lorgest > i { *lorgest = i; } + // This should be done with a single `.peek_mut().unwrap()` but + // `PeekMut` sifts-down unconditionally on Rust 1.46.0 and prior. + if *heap.peek().unwrap() > i { + *heap.peek_mut().unwrap() = i; + } } heap From 11b2058960e636f4352e3d3316dbd2f13ee81c42 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:15:21 +0200 Subject: [PATCH 12/13] test_std: Add generic_test! to instanciate tests w/ multiple types --- Cargo.toml | 1 + tests/test_std.rs | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3de576bc3..693f0f650 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ either = { version = "1.0", default-features = false } [dev-dependencies] rand = "0.7" criterion = "=0" # TODO how could this work with our minimum supported rust version? +paste = "1.0.0" # Used in test_std to instanciate generic tests [dev-dependencies.quickcheck] version = "0.9" diff --git a/tests/test_std.rs b/tests/test_std.rs index c4f57c0e5..9491c4d84 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -1,3 +1,4 @@ +use paste; use permutohedron; use quickcheck as qc; use rand::{distributions::{Distribution, Standard}, Rng, SeedableRng, rngs::StdRng}; @@ -426,21 +427,20 @@ where ) } -qc::quickcheck! { - fn k_smallest_sort_u8(i: RandIter, k: u16) -> () { - k_smallest_sort(i, k) - } - fn k_smallest_sort_u16(i: RandIter, k: u16) -> () { - k_smallest_sort(i, k) - } - fn k_smallest_sort_u32(i: RandIter, k: u16) -> () { - k_smallest_sort(i, k) - } - fn k_smallest_sort_u64(i: RandIter, k: u16) -> () { - k_smallest_sort(i, k) - } +macro_rules! generic_test { + ($f:ident, $($t:ty),+) => { + $(paste::item! { + qc::quickcheck! { + fn [< $f _ $t >](i: RandIter<$t>, k: u16) -> () { + $f(i, k) + } + } + })+ + }; } +generic_test!(k_smallest_sort, u8, u16, u32, u64, i8, i16, i32, i64); + #[test] fn sorted_by_key() { let sc = [3, 4, 1, 2].iter().cloned().sorted_by_key(|&x| x); @@ -474,7 +474,6 @@ fn test_multipeek() { assert_eq!(mp.next(), Some(5)); assert_eq!(mp.next(), None); assert_eq!(mp.peek(), None); - } #[test] From f28ffd083c6e032be8eb5785d916c8978c785548 Mon Sep 17 00:00:00 2001 From: nicoo Date: Fri, 2 Oct 2020 12:24:54 +0200 Subject: [PATCH 13/13] Make k_smallest work with just `alloc` --- src/k_smallest.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/k_smallest.rs b/src/k_smallest.rs index 0995b6c1c..d58ec70d0 100644 --- a/src/k_smallest.rs +++ b/src/k_smallest.rs @@ -1,5 +1,5 @@ -use std::collections::BinaryHeap; -use std::cmp::Ord; +use alloc::collections::BinaryHeap; +use core::cmp::Ord; pub(crate) fn k_smallest>(mut iter: I, k: usize) -> BinaryHeap { if k == 0 { return BinaryHeap::new(); }