Skip to content

Commit

Permalink
Merge #502
Browse files Browse the repository at this point in the history
502: add .duplicates() and .duplicates_by(..) operations r=jswrenn a=petrosagg

Uses a HashMap to detect duplicates in an iterator and emits them only once. The implementation is similar to the `unique()` and `unique_by()` methods but in this case the items are never cloned.

Co-authored-by: Petros Angelatos <petrosagg@gmail.com>
  • Loading branch information
bors[bot] and petrosagg committed Jan 21, 2021
2 parents a71fce5 + e1090aa commit 853f064
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 0 deletions.
206 changes: 206 additions & 0 deletions src/duplicates_impl.rs
@@ -0,0 +1,206 @@
use std::hash::Hash;

mod private {
use std::collections::HashMap;
use std::hash::Hash;
use std::fmt;

/// Iterator adaptor that yields each element whose key is observed for the
/// *second* time; later occurrences of the same key are swallowed.
#[derive(Clone)]
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
pub struct DuplicatesBy<I: Iterator, Key, F> {
    // The underlying iterator being scanned for duplicates.
    pub(crate) iter: I,
    // Bookkeeping shared between `next` and `next_back`: the set of seen
    // keys, the pending count, and the keying method.
    pub(crate) meta: Meta<Key, F>,
}

impl<I, V, F> fmt::Debug for DuplicatesBy<I, V, F>
where
    I: Iterator + fmt::Debug,
    V: fmt::Debug + Hash + Eq,
{
    // Project macro that expands to a `fmt` method printing the listed
    // fields. Only `iter` and the seen-key map are shown; the key method
    // `F` is deliberately excluded so closures need not implement Debug.
    debug_fmt_fields!(DuplicatesBy, iter, meta.used);
}

impl<I: Iterator, Key: Eq + Hash, F> DuplicatesBy<I, Key, F> {
    /// Wrap `iter`, starting from an empty key table and zero pending keys.
    pub(crate) fn new(iter: I, key_method: F) -> Self {
        let meta = Meta {
            used: HashMap::new(),
            pending: 0,
            key_method,
        };
        Self { iter, meta }
    }
}

/// Shared duplicate-detection state used by both iteration directions.
#[derive(Clone)]
pub struct Meta<Key, F> {
    // Maps every key seen so far to whether its duplicate has already been
    // emitted (`true` = emitted, further occurrences are dropped).
    used: HashMap<Key, bool>,
    // Number of keys seen exactly once so far — i.e. entries in `used`
    // still mapped to `false`. Used by `size_hint`.
    pending: usize,
    // Produces the comparison key for each item (a `KeyMethod`).
    key_method: F,
}

impl<Key, F> Meta<Key, F>
where
    Key: Eq + Hash,
{
    /// Decide whether `item` should be emitted.
    ///
    /// Returns `Some(item)` exactly on the second sighting of its key; the
    /// first sighting records the key and returns `None`, and every sighting
    /// after the second is dropped.
    #[inline(always)]
    fn filter<I>(&mut self, item: I) -> Option<I>
    where
        F: KeyMethod<Key, I>,
    {
        let kv = self.key_method.make(item);
        match self.used.get_mut(kv.key_ref()) {
            // Already emitted once: swallow every further occurrence.
            Some(true) => None,
            // Seen exactly once before: mark as emitted and yield the item.
            Some(emitted) => {
                *emitted = true;
                self.pending -= 1;
                Some(kv.value())
            }
            // First sighting: remember the key (not yet emitted), consume item.
            None => {
                self.pending += 1;
                self.used.insert(kv.key(), false);
                None
            }
        }
    }
}

impl<I, Key, F> Iterator for DuplicatesBy<I, Key, F>
where
    I: Iterator,
    Key: Eq + Hash,
    F: KeyMethod<Key, I::Item>,
{
    type Item = I::Item;

    /// Advance the base iterator until an item's key is seen for the
    /// second time, and yield that item.
    fn next(&mut self) -> Option<Self::Item> {
        let DuplicatesBy { iter, meta } = self;
        iter.find_map(|v| meta.filter(v))
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, hi) = self.iter.size_hint();
        // There are at most `hi` items left in the base iterator. Best case:
        // every pending key (seen exactly once so far) reappears among them,
        // plus (hi - pending) / 2 pairs of never-before-seen items.
        let hi = hi.map(|hi| {
            let max_pending = std::cmp::min(self.meta.pending, hi);
            // BUGFIX: `hi - self.meta.pending` is unchecked `usize` subtraction
            // and underflows (panicking in debug builds) whenever more keys are
            // pending than the base iterator's remaining upper bound — the old
            // `std::cmp::max(.., 0)` was a no-op on an unsigned type and did
            // not guard against this. `saturating_sub` clamps at zero.
            let max_new = hi.saturating_sub(self.meta.pending) / 2;
            max_pending + max_new
        });
        // The lower bound is always 0 since we might only get unique items
        // from now on.
        (0, hi)
    }
}

impl<I, Key, F> DoubleEndedIterator for DuplicatesBy<I, Key, F>
where
    I: DoubleEndedIterator,
    Key: Eq + Hash,
    F: KeyMethod<Key, I::Item>,
{
    /// Same duplicate detection as `next`, but scanning from the back.
    fn next_back(&mut self) -> Option<Self::Item> {
        let DuplicatesBy { iter, meta } = self;
        while let Some(item) = iter.next_back() {
            if let Some(dup) = meta.filter(item) {
                return Some(dup);
            }
        }
        None
    }
}

/// A keying method for use with `DuplicatesBy`
pub trait KeyMethod<K, V> {
    /// Container carrying the key (and possibly the value) for one element.
    type Container: KeyXorValue<K, V>;

    /// Wrap `value` together with its comparison key.
    fn make(&mut self, value: V) -> Self::Container;
}

/// Keying method that uses each element itself as its own key
/// (identity function) when checking for equality.
pub struct ById;
impl<V> KeyMethod<V, V> for ById {
    type Container = JustValue<V>;

    fn make(&mut self, value: V) -> Self::Container {
        JustValue(value)
    }
}

/// Keying method that derives each element's key by applying a
/// user-supplied function before checking for equality.
pub struct ByFn<F>(pub(crate) F);
impl<K, V, F> KeyMethod<K, V> for ByFn<F>
where
    F: FnMut(&V) -> K,
{
    type Container = KeyValue<K, V>;

    fn make(&mut self, value: V) -> Self::Container {
        let key = (self.0)(&value);
        KeyValue(key, value)
    }
}

// Implementors of this trait can hold onto a key and a value but only give access to one of them
// at a time. This allows the key and the value to be the same value internally
pub trait KeyXorValue<K, V> {
    /// Borrow the key for a map lookup without consuming the container.
    fn key_ref(&self) -> &K;
    /// Consume the container, yielding the key (the value is discarded).
    fn key(self) -> K;
    /// Consume the container, yielding the value (the key is discarded).
    fn value(self) -> V;
}

/// Container holding a distinct key alongside its value.
pub struct KeyValue<K, V>(K, V);
impl<K, V> KeyXorValue<K, V> for KeyValue<K, V> {
    fn key_ref(&self) -> &K {
        let KeyValue(key, _) = self;
        key
    }
    fn key(self) -> K {
        let KeyValue(key, _) = self;
        key
    }
    fn value(self) -> V {
        let KeyValue(_, value) = self;
        value
    }
}

/// Container holding a single value that doubles as its own key.
pub struct JustValue<V>(V);
impl<V> KeyXorValue<V, V> for JustValue<V> {
    fn key_ref(&self) -> &V {
        let JustValue(value) = self;
        value
    }
    fn key(self) -> V {
        self.0
    }
    fn value(self) -> V {
        self.0
    }
}
}

/// An iterator adapter to filter for duplicate elements.
///
/// See [`.duplicates_by()`](../trait.Itertools.html#method.duplicates_by) for more information.
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
pub type DuplicatesBy<I, V, F> = private::DuplicatesBy<I, V, private::ByFn<F>>;

/// Create a new `DuplicatesBy` iterator.
pub fn duplicates_by<I, Key, F>(iter: I, f: F) -> DuplicatesBy<I, Key, F>
where
Key: Eq + Hash,
F: FnMut(&I::Item) -> Key,
I: Iterator,
{
DuplicatesBy::new(iter, private::ByFn(f))
}

/// An iterator adapter to filter out duplicate elements.
///
/// See [`.duplicates()`](../trait.Itertools.html#method.duplicates) for more information.
pub type Duplicates<I> = private::DuplicatesBy<I, <I as Iterator>::Item, private::ById>;

/// Create a new `Duplicates` iterator.
pub fn duplicates<I>(iter: I) -> Duplicates<I>
where
    I: Iterator,
    I::Item: Eq + Hash,
{
    private::DuplicatesBy::new(iter, private::ById)
}

52 changes: 52 additions & 0 deletions src/lib.rs
Expand Up @@ -149,6 +149,8 @@ pub mod structs {
pub use crate::tee::Tee;
pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples};
#[cfg(feature = "use_std")]
pub use crate::duplicates_impl::{Duplicates, DuplicatesBy};
#[cfg(feature = "use_std")]
pub use crate::unique_impl::{Unique, UniqueBy};
pub use crate::with_position::WithPosition;
pub use crate::zip_eq_impl::ZipEq;
Expand Down Expand Up @@ -230,6 +232,8 @@ mod sources;
mod tee;
mod tuple_impl;
#[cfg(feature = "use_std")]
mod duplicates_impl;
#[cfg(feature = "use_std")]
mod unique_impl;
mod with_position;
mod zip_eq_impl;
Expand Down Expand Up @@ -1147,6 +1151,54 @@ pub trait Itertools : Iterator {
adaptors::dedup_by_with_count(self, cmp)
}

/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration. Duplicates are detected using hash and equality.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
/// than twice, the second item is the item retained and the rest are discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec![10, 20, 30, 20, 40, 10, 50];
/// itertools::assert_equal(data.into_iter().duplicates(),
/// vec![20, 10]);
/// ```
#[cfg(feature = "use_std")]
fn duplicates(self) -> Duplicates<Self>
where Self: Sized,
Self::Item: Eq + Hash
{
duplicates_impl::duplicates(self)
}

/// Return an iterator adaptor that produces elements that appear more than once during the
/// iteration. Duplicates are detected using hash and equality.
///
/// Duplicates are detected by comparing the key they map to with the keying function `f` by
/// hash and equality. The keys are stored in a hash map in the iterator.
///
/// The iterator is stable, returning the duplicate items in the order in which they occur in
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
/// than twice, the second item is the item retained and the rest are discarded.
///
/// ```
/// use itertools::Itertools;
///
/// let data = vec!["a", "bb", "aa", "c", "ccc"];
/// itertools::assert_equal(data.into_iter().duplicates_by(|s| s.len()),
/// vec!["aa", "c"]);
/// ```
#[cfg(feature = "use_std")]
fn duplicates_by<V, F>(self, f: F) -> DuplicatesBy<Self, V, F>
where Self: Sized,
V: Eq + Hash,
F: FnMut(&Self::Item) -> V
{
duplicates_impl::duplicates_by(self, f)
}

/// Return an iterator adaptor that filters out elements that have
/// already been produced once during the iteration. Duplicates
/// are detected using hash and equality.
Expand Down
6 changes: 6 additions & 0 deletions tests/quick.rs
Expand Up @@ -915,6 +915,12 @@ quickcheck! {
}
}

// Property test: `duplicates()` must report a `size_hint` consistent with
// the number of items it actually yields, for arbitrary input iterators.
quickcheck! {
    fn size_duplicates(it: Iter<i8>) -> bool {
        correct_size_hint(it.duplicates())
    }
}

quickcheck! {
fn size_unique(it: Iter<i8>) -> bool {
correct_size_hint(it.unique())
Expand Down
26 changes: 26 additions & 0 deletions tests/test_std.rs
Expand Up @@ -59,6 +59,32 @@ fn interleave_shortest() {
assert_eq!(it.size_hint(), (6, Some(6)));
}

#[test]
fn duplicates_by() {
    let input = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"];
    // Keyed by the first two characters; a duplicate is reported at the
    // position of its second occurrence.
    let expected = ["aa", "bbbb", "cccc"];
    it::assert_equal(expected.iter(), input.iter().duplicates_by(|x| x[..2].to_string()));
    // Reversing before and after yields the same duplicates in the same order.
    it::assert_equal(
        expected.iter(),
        input.iter().rev().duplicates_by(|x| x[..2].to_string()).rev(),
    );
    // Scanning from the back reports second-from-the-back occurrences.
    let expected_rev = ["ccc", "aa", "bbbbb"];
    it::assert_equal(
        expected_rev.iter(),
        input.iter().duplicates_by(|x| x[..2].to_string()).rev(),
    );
}

#[test]
fn duplicates() {
    // Each duplicate is reported once, at its second occurrence.
    let input = [0, 1, 2, 3, 2, 1, 3];
    let expected = [2, 1, 3];
    it::assert_equal(expected.iter(), input.iter().duplicates());
    it::assert_equal(expected.iter(), input.iter().rev().duplicates().rev());
    // Scanning from the back flips which occurrence counts as "second".
    let expected_rev = [3, 2, 1];
    it::assert_equal(expected_rev.iter(), input.iter().duplicates().rev());

    // Input where every element is a duplicate.
    let input = [0, 1, 0, 1];
    let expected = [0, 1];
    it::assert_equal(expected.iter(), input.iter().duplicates());
    it::assert_equal(expected.iter(), input.iter().rev().duplicates().rev());
    let expected_rev = [1, 0];
    it::assert_equal(expected_rev.iter(), input.iter().duplicates().rev());
}

#[test]
fn unique_by() {
Expand Down

0 comments on commit 853f064

Please sign in to comment.