Skip to content

Commit

Permalink
add .duplicate() and .duplicate_by(..) operations
Browse files Browse the repository at this point in the history
Uses a HashMap to detect duplicates in an iterator and emits them only
once. Items are never cloned.

Signed-off-by: Petros Angelatos <petrosagg@gmail.com>
  • Loading branch information
petrosagg committed Dec 4, 2020
1 parent 9958c45 commit 88072e8
Show file tree
Hide file tree
Showing 4 changed files with 281 additions and 0 deletions.
197 changes: 197 additions & 0 deletions src/duplicate_impl.rs
@@ -0,0 +1,197 @@
use std::cmp::{max, min};
use std::collections::HashMap;
use std::hash::Hash;
use std::fmt;

/// An iterator adapter that yields only the elements that appear more than
/// once, each emitted exactly once (on its second occurrence).
///
/// See [`.duplicate_by()`](../trait.Itertools.html#method.duplicate_by) for more information.
#[derive(Clone)]
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
pub struct DuplicateBy<I: Iterator, V, F> {
    // The adapted iterator.
    iter: I,
    // Maps each key seen so far to "has its duplicate already been emitted?".
    used: HashMap<V, bool>,
    // Number of keys seen exactly once so far; consulted by `size_hint`.
    pending: usize,
    // Keying function mapping each item to the key used for duplicate detection.
    f: F,
}

impl<I, V, F> fmt::Debug for DuplicateBy<I, V, F>
where I: Iterator + fmt::Debug,
V: fmt::Debug + Hash + Eq,
{
// Only `iter` and `used` are printed; `f` is omitted because the keying
// closure has no `Debug` impl (and `pending` is derivable from `used`).
debug_fmt_fields!(DuplicateBy, iter, used);
}

/// Create a new `DuplicateBy` iterator over `iter`, detecting duplicates by
/// the key computed with `f`.
pub fn duplicate_by<I, V, F>(iter: I, f: F) -> DuplicateBy<I, V, F>
    where V: Eq + Hash,
          F: FnMut(&I::Item) -> V,
          I: Iterator,
{
    // Start with no keys recorded and nothing pending.
    let used = HashMap::new();
    DuplicateBy { iter, used, pending: 0, f }
}

impl<I, V, F> Iterator for DuplicateBy<I, V, F>
where I: Iterator,
V: Eq + Hash,
F: FnMut(&I::Item) -> V
{
type Item = I::Item;

fn next(&mut self) -> Option<Self::Item> {
while let Some(v) = self.iter.next() {
let key = (self.f)(&v);
match self.used.get_mut(&key) {
None => {
self.used.insert(key, false);
self.pending += 1;
},
Some(true) => (),
Some(produced) => {
*produced = true;
self.pending -= 1;
return Some(v);
},
}
}
None
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (_, hi) = self.iter.size_hint();
// There are `hi` number of items left in the base iterator. In the best case scenario,
// these items are exactly the same as the ones pending (i.e items seen exactly once so
// far), plus (hi - pending) / 2 pairs of never seen before items.
let hi = hi.map(|hi| {
let max_pending = min(self.pending, hi);
let max_new = max(hi - self.pending, 0) / 2;
max_pending + max_new
});
// The lower bound is always 0 since we might only get unique items from now on
(0, hi)
}
}

impl<I, V, F> DoubleEndedIterator for DuplicateBy<I, V, F>
    where I: DoubleEndedIterator,
          V: Eq + Hash,
          F: FnMut(&I::Item) -> V
{
    /// Mirror of `next`, scanning from the back: yields an item the second
    /// time its key is encountered in back-to-front order.
    fn next_back(&mut self) -> Option<Self::Item> {
        loop {
            // `?` terminates with `None` once the base iterator is exhausted.
            let v = self.iter.next_back()?;
            let key = (self.f)(&v);
            match self.used.get_mut(&key) {
                // First encounter of this key: record it and keep scanning.
                None => {
                    self.used.insert(key, false);
                    self.pending += 1;
                }
                // This key's duplicate was already emitted: skip.
                Some(true) => {}
                // Second encounter: flag it as emitted and yield the item.
                Some(produced) => {
                    *produced = true;
                    self.pending -= 1;
                    return Some(v);
                }
            }
        }
    }
}

impl<I> Iterator for Duplicate<I>
where I: Iterator,
I::Item: Eq + Hash
{
type Item = I::Item;

fn next(&mut self) -> Option<Self::Item> {
while let Some(v) = self.iter.iter.next() {
match self.iter.used.get_mut(&v) {
None => {
self.iter.used.insert(v, false);
self.iter.pending += 1;
},
Some(true) => (),
Some(produced) => {
*produced = true;
self.iter.pending -= 1;
return Some(v);
},
}
}
None
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (_, hi) = self.iter.iter.size_hint();
// There are `hi` number of items left in the base iterator. In the best case scenario,
// these items are exactly the same as the ones pending (i.e items seen exactly once so
// far), plus (hi - pending) / 2 pairs of never seen before items.
let hi = hi.map(|hi| {
let max_pending = min(self.iter.pending, hi);
let max_new = max(hi - self.iter.pending, 0) / 2;
max_pending + max_new
});
// The lower bound is always 0 since we might only get unique items from now on
(0, hi)
}
}

impl<I> DoubleEndedIterator for Duplicate<I>
    where I: DoubleEndedIterator,
          I::Item: Eq + Hash
{
    /// Mirror of `next`, scanning from the back: yields an item the second
    /// time it is encountered in back-to-front order.
    fn next_back(&mut self) -> Option<Self::Item> {
        loop {
            // `?` terminates with `None` once the base iterator is exhausted.
            let v = self.iter.iter.next_back()?;
            match self.iter.used.get_mut(&v) {
                // First encounter: record it and keep scanning.
                None => {
                    self.iter.used.insert(v, false);
                    self.iter.pending += 1;
                }
                // Duplicate already emitted: skip.
                Some(true) => {}
                // Second encounter: flag it as emitted and yield it.
                Some(produced) => {
                    *produced = true;
                    self.iter.pending -= 1;
                    return Some(v);
                }
            }
        }
    }
}

/// An iterator adapter that yields only the elements that appear more than
/// once, each emitted exactly once (on its second occurrence).
///
/// See [`.duplicate()`](../trait.Itertools.html#method.duplicate) for more information.
#[derive(Clone)]
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
pub struct Duplicate<I: Iterator> {
    // Reuses `DuplicateBy`'s state with the item itself as the key; `f: ()`
    // marks the absence of a keying function, so `Duplicate` provides its own
    // `Iterator` impls rather than delegating.
    iter: DuplicateBy<I, I::Item, ()>,
}

impl<I> fmt::Debug for Duplicate<I>
where I: Iterator + fmt::Debug,
I::Item: Hash + Eq + fmt::Debug,
{
// Delegates to the inner `DuplicateBy`'s field formatting via the crate macro.
debug_fmt_fields!(Duplicate, iter);
}

/// Create a new `Duplicate` iterator over `iter`, detecting duplicates by
/// hashing and comparing the items themselves.
pub fn duplicate<I>(iter: I) -> Duplicate<I>
    where I: Iterator,
          I::Item: Eq + Hash,
{
    // Build the inner state directly: `f: ()` stands in for "no keying
    // function", so `duplicate_by` cannot be reused here.
    let inner = DuplicateBy { iter, used: HashMap::new(), pending: 0, f: () };
    Duplicate { iter: inner }
}
52 changes: 52 additions & 0 deletions src/lib.rs
Expand Up @@ -144,6 +144,8 @@ pub mod structs {
pub use crate::tee::Tee;
pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples};
#[cfg(feature = "use_std")]
pub use crate::duplicate_impl::{Duplicate, DuplicateBy};
#[cfg(feature = "use_std")]
pub use crate::unique_impl::{Unique, UniqueBy};
pub use crate::with_position::WithPosition;
pub use crate::zip_eq_impl::ZipEq;
Expand Down Expand Up @@ -219,6 +221,8 @@ mod sources;
mod tee;
mod tuple_impl;
#[cfg(feature = "use_std")]
mod duplicate_impl;
#[cfg(feature = "use_std")]
mod unique_impl;
mod with_position;
mod zip_eq_impl;
Expand Down Expand Up @@ -1138,6 +1142,54 @@ pub trait Itertools : Iterator {
adaptors::dedup_by_with_count(self, cmp)
}

/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration. Duplicates are detected using hash and equality.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once: if an item appears
/// more than twice, its second occurrence is the one yielded and later occurrences are
/// discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec![10, 20, 30, 20, 40, 10, 50];
/// itertools::assert_equal(data.into_iter().duplicate(),
/// vec![20, 10]);
/// ```
#[cfg(feature = "use_std")]
fn duplicate(self) -> Duplicate<Self>
where Self: Sized,
Self::Item: Eq + Hash
{
duplicate_impl::duplicate(self)
}

/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration.
///
/// Duplicates are detected by comparing the key they map to with the keying function `f` by
/// hash and equality. The keys are stored in a hash map in the iterator.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once: if an item appears
/// more than twice, its second occurrence is the one yielded and later occurrences are
/// discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec!["a", "bb", "aa", "c", "ccc"];
/// itertools::assert_equal(data.into_iter().duplicate_by(|s| s.len()),
/// vec!["aa", "c"]);
/// ```
#[cfg(feature = "use_std")]
fn duplicate_by<V, F>(self, f: F) -> DuplicateBy<Self, V, F>
where Self: Sized,
V: Eq + Hash,
F: FnMut(&Self::Item) -> V
{
duplicate_impl::duplicate_by(self, f)
}

/// Return an iterator adaptor that filters out elements that have
/// already been produced once during the iteration. Duplicates
/// are detected using hash and equality.
Expand Down
6 changes: 6 additions & 0 deletions tests/quick.rs
Expand Up @@ -907,6 +907,12 @@ quickcheck! {
}
}

quickcheck! {
// `duplicate` must report a `size_hint` that brackets the number of items
// it actually yields, for arbitrary `i8` inputs.
fn size_duplicate(it: Iter<i8>) -> bool {
correct_size_hint(it.duplicate())
}
}

quickcheck! {
fn size_unique(it: Iter<i8>) -> bool {
correct_size_hint(it.unique())
Expand Down
26 changes: 26 additions & 0 deletions tests/test_std.rs
Expand Up @@ -54,6 +54,32 @@ fn interleave_shortest() {
assert_eq!(it.size_hint(), (6, Some(6)));
}

#[test]
fn duplicate_by() {
// Keys are the first two characters, so the "aa…", "bb…" and "cc…" groups
// each contain repeats.
let xs = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"];
// Forward iteration yields the second occurrence of each key, in order.
let ys = ["aa", "bbbb", "cccc"];
it::assert_equal(ys.iter(), xs.iter().duplicate_by(|x| x[..2].to_string()));
// Reversing the input and then the output reproduces the same sequence.
it::assert_equal(ys.iter(), xs.iter().rev().duplicate_by(|x| x[..2].to_string()).rev());
// Iterating from the back yields the second-from-the-back occurrences.
let ys_rev = ["ccc", "aa", "bbbbb"];
it::assert_equal(ys_rev.iter(), xs.iter().duplicate_by(|x| x[..2].to_string()).rev());
}

#[test]
fn duplicate() {
let xs = [0, 1, 2, 3, 2, 1, 3];
// Forward iteration yields each duplicate on its second occurrence.
let ys = [2, 1, 3];
it::assert_equal(ys.iter(), xs.iter().duplicate());
// Reversing the input and then the output reproduces the same sequence.
it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev());
// Iterating from the back yields second-from-the-back occurrences.
let ys_rev = [3, 2, 1];
it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev());

// Every item appears exactly twice: all of them are duplicates.
let xs = [0, 1, 0, 1];
let ys = [0, 1];
it::assert_equal(ys.iter(), xs.iter().duplicate());
it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev());
let ys_rev = [1, 0];
it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev());
}

#[test]
fn unique_by() {
Expand Down

0 comments on commit 88072e8

Please sign in to comment.