From e1090aa0902943d2f6cc89a5878193d0ff950b29 Mon Sep 17 00:00:00 2001 From: Petros Angelatos Date: Thu, 3 Dec 2020 18:33:43 +0100 Subject: [PATCH] add .duplicates() and .duplicates_by(..) operations Uses a HashMap to detect duplicates in an iterator and emits them only once. Items are never cloned. Signed-off-by: Petros Angelatos --- src/duplicates_impl.rs | 206 +++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 52 +++++++++++ tests/quick.rs | 6 ++ tests/test_std.rs | 26 ++++++ 4 files changed, 290 insertions(+) create mode 100644 src/duplicates_impl.rs diff --git a/src/duplicates_impl.rs b/src/duplicates_impl.rs new file mode 100644 index 000000000..a581cc04d --- /dev/null +++ b/src/duplicates_impl.rs @@ -0,0 +1,206 @@ +use std::hash::Hash; + +mod private { + use std::collections::HashMap; + use std::hash::Hash; + use std::fmt; + + #[derive(Clone)] + #[must_use = "iterator adaptors are lazy and do nothing unless consumed"] + pub struct DuplicatesBy { + pub(crate) iter: I, + pub(crate) meta: Meta, + } + + impl fmt::Debug for DuplicatesBy + where + I: Iterator + fmt::Debug, + V: fmt::Debug + Hash + Eq, + { + debug_fmt_fields!(DuplicatesBy, iter, meta.used); + } + + impl DuplicatesBy { + pub(crate) fn new(iter: I, key_method: F) -> Self { + DuplicatesBy { + iter, + meta: Meta { + used: HashMap::new(), + pending: 0, + key_method, + }, + } + } + } + + #[derive(Clone)] + pub struct Meta { + used: HashMap, + pending: usize, + key_method: F, + } + + impl Meta + where + Key: Eq + Hash, + { + /// Takes an item and returns it back to the caller if it's the second time we see it. + /// Otherwise the item is consumed and None is returned + #[inline(always)] + fn filter(&mut self, item: I) -> Option + where + F: KeyMethod, + { + let kv = self.key_method.make(item); + match self.used.get_mut(kv.key_ref()) { + None => { + self.used.insert(kv.key(), false); + self.pending += 1; + None + } + Some(true) => None, + Some(produced) => { + *produced = true; + self.pending -= 1; + Some(kv.value()) + } + } + } + } + + impl Iterator for DuplicatesBy + where + I: Iterator, + Key: Eq + Hash, + F: KeyMethod, + { + type Item = I::Item; + + fn next(&mut self) -> Option { + let DuplicatesBy { iter, meta } = self; + iter.find_map(|v| meta.filter(v)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (_, hi) = self.iter.size_hint(); + // There are `hi` number of items left in the base iterator. In the best case scenario, + // these items are exactly the same as the ones pending (i.e items seen exactly once so + // far), plus (hi - pending) / 2 pairs of never seen before items. + let hi = hi.map(|hi| { + let max_pending = std::cmp::min(self.meta.pending, hi); + let max_new = std::cmp::max(hi - self.meta.pending, 0) / 2; + max_pending + max_new + }); + // The lower bound is always 0 since we might only get unique items from now on + (0, hi) + } + } + + impl DoubleEndedIterator for DuplicatesBy + where + I: DoubleEndedIterator, + Key: Eq + Hash, + F: KeyMethod, + { + fn next_back(&mut self) -> Option { + let DuplicatesBy { iter, meta } = self; + iter.rev().find_map(|v| meta.filter(v)) + } + } + + /// A keying method for use with `DuplicatesBy` + pub trait KeyMethod { + type Container: KeyXorValue; + + fn make(&mut self, value: V) -> Self::Container; + } + + /// Apply the identity function to elements before checking them for equality. + pub struct ById; + impl KeyMethod for ById { + type Container = JustValue; + + fn make(&mut self, v: V) -> Self::Container { + JustValue(v) + } + } + + /// Apply a user-supplied function to elements before checking them for equality. + pub struct ByFn(pub(crate) F); + impl KeyMethod for ByFn + where + F: FnMut(&V) -> K, + { + type Container = KeyValue; + + fn make(&mut self, v: V) -> Self::Container { + KeyValue((self.0)(&v), v) + } + } + + // Implementors of this trait can hold onto a key and a value but only give access to one of them + // at a time. This allows the key and the value to be the same value internally + pub trait KeyXorValue { + fn key_ref(&self) -> &K; + fn key(self) -> K; + fn value(self) -> V; + } + + pub struct KeyValue(K, V); + impl KeyXorValue for KeyValue { + fn key_ref(&self) -> &K { + &self.0 + } + fn key(self) -> K { + self.0 + } + fn value(self) -> V { + self.1 + } + } + + pub struct JustValue(V); + impl KeyXorValue for JustValue { + fn key_ref(&self) -> &V { + &self.0 + } + fn key(self) -> V { + self.0 + } + fn value(self) -> V { + self.0 + } + } +} + +/// An iterator adapter to filter for duplicate elements. +/// +/// See [`.duplicates_by()`](../trait.Itertools.html#method.duplicates_by) for more information. +#[must_use = "iterator adaptors are lazy and do nothing unless consumed"] +pub type DuplicatesBy = private::DuplicatesBy>; + +/// Create a new `DuplicatesBy` iterator. +pub fn duplicates_by(iter: I, f: F) -> DuplicatesBy +where + Key: Eq + Hash, + F: FnMut(&I::Item) -> Key, + I: Iterator, +{ + DuplicatesBy::new(iter, private::ByFn(f)) +} + +/// An iterator adapter to filter out duplicate elements. +/// +/// See [`.duplicates()`](../trait.Itertools.html#method.duplicates) for more information. +pub type Duplicates = private::DuplicatesBy::Item, private::ById>; + +/// Create a new `Duplicates` iterator. +pub fn duplicates(iter: I) -> Duplicates +where + I: Iterator, + I::Item: Eq + Hash, +{ + Duplicates::new(iter, private::ById) +} + diff --git a/src/lib.rs b/src/lib.rs index 7cd451d1f..26237b454 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -147,6 +147,8 @@ pub mod structs { pub use crate::tee::Tee; pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples}; #[cfg(feature = "use_std")] + pub use crate::duplicates_impl::{Duplicates, DuplicatesBy}; + #[cfg(feature = "use_std")] pub use crate::unique_impl::{Unique, UniqueBy}; pub use crate::with_position::WithPosition; pub use crate::zip_eq_impl::ZipEq; @@ -228,6 +230,8 @@ mod sources; mod tee; mod tuple_impl; #[cfg(feature = "use_std")] +mod duplicates_impl; +#[cfg(feature = "use_std")] mod unique_impl; mod with_position; mod zip_eq_impl; @@ -1145,6 +1149,54 @@ pub trait Itertools : Iterator { adaptors::dedup_by_with_count(self, cmp) } + /// Return an iterator adaptor that produces elements that appear more than once during the + /// iteration. Duplicates are detected using hash and equality. + /// + /// The iterator is stable, returning the duplicate items in the order in which they occur in + /// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more + /// than twice, the second item is the item retained and the rest are discarded. + /// + /// ``` + /// use itertools::Itertools; + /// + /// let data = vec![10, 20, 30, 20, 40, 10, 50]; + /// itertools::assert_equal(data.into_iter().duplicates(), + /// vec![20, 10]); + /// ``` + #[cfg(feature = "use_std")] + fn duplicates(self) -> Duplicates + where Self: Sized, + Self::Item: Eq + Hash + { + duplicates_impl::duplicates(self) + } + + /// Return an iterator adaptor that produces elements that appear more than once during the + /// iteration. Duplicates are detected using hash and equality. + /// + /// Duplicates are detected by comparing the key they map to with the keying function `f` by + /// hash and equality. The keys are stored in a hash map in the iterator. + /// + /// The iterator is stable, returning the duplicate items in the order in which they occur in + /// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more + /// than twice, the second item is the item retained and the rest are discarded. + /// + /// ``` + /// use itertools::Itertools; + /// + /// let data = vec!["a", "bb", "aa", "c", "ccc"]; + /// itertools::assert_equal(data.into_iter().duplicates_by(|s| s.len()), + /// vec!["aa", "c"]); + /// ``` + #[cfg(feature = "use_std")] + fn duplicates_by(self, f: F) -> DuplicatesBy + where Self: Sized, + V: Eq + Hash, + F: FnMut(&Self::Item) -> V + { + duplicates_impl::duplicates_by(self, f) + } + /// Return an iterator adaptor that filters out elements that have /// already been produced once during the iteration. Duplicates /// are detected using hash and equality. diff --git a/tests/quick.rs b/tests/quick.rs index e5bee1713..b4ae576af 100644 --- a/tests/quick.rs +++ b/tests/quick.rs @@ -915,6 +915,12 @@ quickcheck! { } } +quickcheck! { + fn size_duplicates(it: Iter) -> bool { + correct_size_hint(it.duplicates()) + } +} + quickcheck! { fn size_unique(it: Iter) -> bool { correct_size_hint(it.unique()) diff --git a/tests/test_std.rs b/tests/test_std.rs index d1ff815da..86b50b6de 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -59,6 +59,32 @@ fn interleave_shortest() { assert_eq!(it.size_hint(), (6, Some(6))); } +#[test] +fn duplicates_by() { + let xs = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"]; + let ys = ["aa", "bbbb", "cccc"]; + it::assert_equal(ys.iter(), xs.iter().duplicates_by(|x| x[..2].to_string())); + it::assert_equal(ys.iter(), xs.iter().rev().duplicates_by(|x| x[..2].to_string()).rev()); + let ys_rev = ["ccc", "aa", "bbbbb"]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicates_by(|x| x[..2].to_string()).rev()); +} + +#[test] +fn duplicates() { + let xs = [0, 1, 2, 3, 2, 1, 3]; + let ys = [2, 1, 3]; + it::assert_equal(ys.iter(), xs.iter().duplicates()); + it::assert_equal(ys.iter(), xs.iter().rev().duplicates().rev()); + let ys_rev = [3, 2, 1]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicates().rev()); + + let xs = [0, 1, 0, 1]; + let ys = [0, 1]; + it::assert_equal(ys.iter(), xs.iter().duplicates()); + it::assert_equal(ys.iter(), xs.iter().rev().duplicates().rev()); + let ys_rev = [1, 0]; + it::assert_equal(ys_rev.iter(), xs.iter().duplicates().rev()); +} #[test] fn unique_by() {