From 929f36dbcb5bd5b93fd0ea5aa8b7301b8a877ba8 Mon Sep 17 00:00:00 2001 From: Petros Angelatos Date: Thu, 3 Dec 2020 18:33:43 +0100 Subject: [PATCH] add .duplicate() and .duplicate_by(..) operations Uses a HashMap to detect duplicates in an iterator and emits them only once. Items are never cloned. Signed-off-by: Petros Angelatos --- src/duplicate_impl.rs | 160 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 52 ++++++++++++++ tests/test_std.rs | 26 +++++++ 3 files changed, 238 insertions(+) create mode 100644 src/duplicate_impl.rs diff --git a/src/duplicate_impl.rs b/src/duplicate_impl.rs new file mode 100644 index 000000000..abfe85521 --- /dev/null +++ b/src/duplicate_impl.rs @@ -0,0 +1,160 @@ +use std::collections::HashMap; +use std::hash::Hash; +use std::fmt; + +/// An iterator adapter that yields only elements that occur more than once. +/// +/// See [`.duplicate_by()`](../trait.Itertools.html#method.duplicate_by) for more information. +#[derive(Clone)] +#[must_use = "iterator adaptors are lazy and do nothing unless consumed"] +pub struct DuplicateBy { + iter: I, + // Maps each key to whether its duplicate has already been yielded + used: HashMap, + f: F, +} + +impl fmt::Debug for DuplicateBy + where I: Iterator + fmt::Debug, + V: fmt::Debug + Hash + Eq, +{ + debug_fmt_fields!(DuplicateBy, iter, used); +} + +/// Create a new `DuplicateBy` iterator.
+pub fn duplicate_by(iter: I, f: F) -> DuplicateBy + where V: Eq + Hash, + F: FnMut(&I::Item) -> V, + I: Iterator, +{ + DuplicateBy { + iter, + used: HashMap::new(), + f, + } +} + +impl Iterator for DuplicateBy + where I: Iterator, + V: Eq + Hash, + F: FnMut(&I::Item) -> V +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + while let Some(v) = self.iter.next() { + let key = (self.f)(&v); + match self.used.get_mut(&key) { + None => { self.used.insert(key, false); }, + Some(true) => (), + Some(produced) => { + *produced = true; + return Some(v); + }, + } + } + None + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (_, hi) = self.iter.size_hint(); + (0, hi) // no lower bound: the remaining items may contain no repeated key + } +} + +impl DoubleEndedIterator for DuplicateBy + where I: DoubleEndedIterator, + V: Eq + Hash, + F: FnMut(&I::Item) -> V +{ + fn next_back(&mut self) -> Option { + while let Some(v) = self.iter.next_back() { + let key = (self.f)(&v); + match self.used.get_mut(&key) { + None => { self.used.insert(key, false); }, + Some(true) => (), + Some(produced) => { + *produced = true; + return Some(v); + }, + } + } + None + } +} + +impl Iterator for Duplicate + where I: Iterator, + I::Item: Eq + Hash +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + while let Some(v) = self.iter.iter.next() { + match self.iter.used.get_mut(&v) { + None => { self.iter.used.insert(v, false); }, + Some(true) => (), + Some(produced) => { + *produced = true; + return Some(v); + }, + } + } + None + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (_, hi) = self.iter.iter.size_hint(); + (0, hi) // no lower bound: the remaining items may contain no repeated item + } +} + +impl DoubleEndedIterator for Duplicate + where I: DoubleEndedIterator, + I::Item: Eq + Hash +{ + fn next_back(&mut self) -> Option { + while let Some(v) = self.iter.iter.next_back() { + match self.iter.used.get_mut(&v) { + None => { self.iter.used.insert(v, false); }, + Some(true) => (), +
Some(produced) => { + *produced = true; + return Some(v); + }, + } + } + None + } +} + +/// An iterator adapter that yields only elements that occur more than once. +/// +/// See [`.duplicate()`](../trait.Itertools.html#method.duplicate) for more information. +#[derive(Clone)] +#[must_use = "iterator adaptors are lazy and do nothing unless consumed"] +pub struct Duplicate { + iter: DuplicateBy, +} + +impl fmt::Debug for Duplicate + where I: Iterator + fmt::Debug, + I::Item: Hash + Eq + fmt::Debug, +{ + debug_fmt_fields!(Duplicate, iter); +} + +pub fn duplicate(iter: I) -> Duplicate + where I: Iterator, + I::Item: Eq + Hash, +{ + Duplicate { + iter: DuplicateBy { + iter, + used: HashMap::new(), + f: (), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index ba7085bfd..b3012dfc6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,6 +144,8 @@ pub mod structs { pub use crate::tee::Tee; pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples}; #[cfg(feature = "use_std")] + pub use crate::duplicate_impl::{Duplicate, DuplicateBy}; + #[cfg(feature = "use_std")] pub use crate::unique_impl::{Unique, UniqueBy}; pub use crate::with_position::WithPosition; pub use crate::zip_eq_impl::ZipEq; @@ -219,6 +221,8 @@ mod sources; mod tee; mod tuple_impl; #[cfg(feature = "use_std")] +mod duplicate_impl; +#[cfg(feature = "use_std")] mod unique_impl; mod with_position; mod zip_eq_impl; @@ -1138,6 +1142,54 @@ pub trait Itertools : Iterator { adaptors::dedup_by_with_count(self, cmp) } + /// Return an iterator adaptor that produces elements that appear more than once during the + /// iteration. Duplicates are detected using hash and equality. + /// + /// The iterator is stable, returning the duplicate items in the order in which they occur in + /// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more + /// than twice, the second item is the item retained and the rest are discarded.
+ /// + /// ``` + /// use itertools::Itertools; + /// + /// let data = vec![10, 20, 30, 20, 40, 10, 50]; + /// itertools::assert_equal(data.into_iter().duplicate(), + /// vec![20, 10]); + /// ``` + #[cfg(feature = "use_std")] + fn duplicate(self) -> Duplicate + where Self: Sized, + Self::Item: Eq + Hash + { + duplicate_impl::duplicate(self) + } + + /// Return an iterator adaptor that produces elements that appear more than once during the + /// iteration. Duplicates are detected using hash and equality. + /// + /// Duplicates are detected by comparing the key they map to with the keying function `f` by + /// hash and equality. The keys are stored in a hash map in the iterator. + /// + /// The iterator is stable, returning the duplicate items in the order in which they occur in + /// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more + /// than twice, the second item is the item retained and the rest are discarded. + /// + /// ``` + /// use itertools::Itertools; + /// + /// let data = vec!["a", "bb", "aa", "c", "ccc"]; + /// itertools::assert_equal(data.into_iter().duplicate_by(|s| s.len()), + /// vec!["aa", "c"]); + /// ``` + #[cfg(feature = "use_std")] + fn duplicate_by(self, f: F) -> DuplicateBy + where Self: Sized, + V: Eq + Hash, + F: FnMut(&Self::Item) -> V + { + duplicate_impl::duplicate_by(self, f) + } + /// Return an iterator adaptor that filters out elements that have /// already been produced once during the iteration. Duplicates /// are detected using hash and equality. 
diff --git a/tests/test_std.rs b/tests/test_std.rs index 86528216b..f3c993a84 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -54,6 +54,32 @@ fn interleave_shortest() { assert_eq!(it.size_hint(), (6, Some(6))); } +#[test] +fn duplicate_by() { + let xs = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"]; + let ys = ["aa", "bbbb", "cccc"]; + it::assert_equal(ys.iter(), xs.iter().duplicate_by(|x| x[..2].to_string())); + it::assert_equal(ys.iter(), xs.iter().rev().duplicate_by(|x| x[..2].to_string()).rev()); + let ys_rev = ["ccc", "aa", "bbbbb"]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicate_by(|x| x[..2].to_string()).rev()); +} + +#[test] +fn duplicate() { + let xs = [0, 1, 2, 3, 2, 1, 3]; + let ys = [2, 1, 3]; + it::assert_equal(ys.iter(), xs.iter().duplicate()); + it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev()); + let ys_rev = [3, 2, 1]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev()); + + let xs = [0, 1, 0, 1]; + let ys = [0, 1]; + it::assert_equal(ys.iter(), xs.iter().duplicate()); + it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev()); + let ys_rev = [1, 0]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev()); +} #[test] fn unique_by() {