From 7baa6e3ec58c53fc691ff8032f51fd8b25323f28 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 5 Oct 2022 17:12:06 +0900 Subject: [PATCH 1/2] Removing alloc on all .next() in MultiValueColumn --- src/indexer/flat_map_with_buffer.rs | 74 +++++++++++++++++++ src/indexer/mod.rs | 2 + .../sorted_doc_id_multivalue_column.rs | 20 +++-- 3 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 src/indexer/flat_map_with_buffer.rs diff --git a/src/indexer/flat_map_with_buffer.rs b/src/indexer/flat_map_with_buffer.rs new file mode 100644 index 0000000000..2e9194018c --- /dev/null +++ b/src/indexer/flat_map_with_buffer.rs @@ -0,0 +1,74 @@ +struct FlatMapWithgBuffer { + buffer: Vec, + fill_buffer: F, + underlying_it: Iter, +} + +impl Iterator for FlatMapWithgBuffer +where + Iter: Iterator, + F: Fn(I, &mut Vec), +{ + type Item = T; + + fn next(&mut self) -> Option { + while self.buffer.is_empty() { + let next_el = self.underlying_it.next()?; + (self.fill_buffer)(next_el, &mut self.buffer); + // We will pop elements, so we reverse the buffer first. + self.buffer.reverse(); + } + self.buffer.pop() + } +} + +/// FUnction similar to `flat_map`, but the generating function fills a buffer +/// instead of returning an Iterator. +pub fn flat_map_with_buffer( + underlying_it: Iter, + fill_buffer: F, +) -> impl Iterator +where + F: Fn(I, &mut Vec), + Iter: Iterator, +{ + FlatMapWithgBuffer { + buffer: Vec::with_capacity(10), + fill_buffer, + underlying_it, + } +} + +#[cfg(test)] +mod tests { + use super::flat_map_with_buffer; + + #[test] + fn test_flat_map_with_buffer_empty() { + let vals: Vec = flat_map_with_buffer( + std::iter::empty::(), + |_val: usize, _buffer: &mut Vec| {}, + ) + .collect(); + assert!(vals.is_empty()); + } + + #[test] + fn test_flat_map_with_buffer_simple() { + let vals: Vec = flat_map_with_buffer(1..5, |val: usize, buffer: &mut Vec| { + buffer.extend(0..val) + }) + .collect(); + assert_eq!(&[0, 0, 1, 0, 1, 2, 0, 1, 2, 3], &vals[..]); + } + + #[test] + fn test_flat_map_filling_no_elements_does_not_stop_iterator() { + let vals: Vec = flat_map_with_buffer( + [2, 0, 0, 3].into_iter(), + |val: usize, buffer: &mut Vec| buffer.extend(0..val), + ) + .collect(); + assert_eq!(&[0, 1, 0, 1, 2], &vals[..]); + } +} diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index fa4db5e660..28e845e636 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -3,6 +3,7 @@ pub mod delete_queue; pub mod demuxer; pub mod doc_id_mapping; mod doc_opstamp_mapping; +mod flat_map_with_buffer; pub mod index_writer; mod index_writer_status; mod json_term_writer; @@ -26,6 +27,7 @@ mod stamper; use crossbeam_channel as channel; use smallvec::SmallVec; +pub use self::flat_map_with_buffer::flat_map_with_buffer; pub use self::index_writer::IndexWriter; pub(crate) use self::json_term_writer::{ convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter, diff --git a/src/indexer/sorted_doc_id_multivalue_column.rs b/src/indexer/sorted_doc_id_multivalue_column.rs index 1e126af815..c960a01bd2 100644 --- a/src/indexer/sorted_doc_id_multivalue_column.rs +++ b/src/indexer/sorted_doc_id_multivalue_column.rs @@ -4,8 +4,9 @@ use fastfield_codecs::Column; use crate::fastfield::MultiValuedFastFieldReader; use crate::indexer::doc_id_mapping::SegmentDocIdMapping; +use crate::indexer::flat_map_with_buffer; use crate::schema::Field; -use crate::SegmentReader; +use crate::{DocAddress, SegmentReader}; pub(crate) struct RemappedDocIdMultiValueColumn<'a> { doc_id_mapping: &'a SegmentDocIdMapping, @@ -71,16 +72,13 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { } fn iter(&self) -> Box + '_> { - Box::new( - self.doc_id_mapping - .iter_old_doc_addrs() - .flat_map(|old_doc_addr| { - let ff_reader = &self.fast_field_readers[old_doc_addr.segment_ord as usize]; - let mut vals = Vec::new(); - ff_reader.get_vals(old_doc_addr.doc_id, &mut vals); - vals.into_iter() - }), - ) + Box::new(flat_map_with_buffer( + self.doc_id_mapping.iter_old_doc_addrs(), + |old_doc_addr: DocAddress, buffer| { + let ff_reader = &self.fast_field_readers[old_doc_addr.segment_ord as usize]; + ff_reader.get_vals(old_doc_addr.doc_id, buffer); + }, + )) } fn min_value(&self) -> u64 { self.min_value From f60a5518907d0f594c4fe900e5b61f2d4762315e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 5 Oct 2022 17:30:41 +0800 Subject: [PATCH 2/2] add flat_map_with_buffer to Iterator trait --- src/indexer/flat_map_with_buffer.rs | 59 +++++++++---------- src/indexer/mod.rs | 1 - .../sorted_doc_id_multivalue_column.rs | 17 +++--- 3 files changed, 36 insertions(+), 41 deletions(-) diff --git a/src/indexer/flat_map_with_buffer.rs b/src/indexer/flat_map_with_buffer.rs index 2e9194018c..88b509cdbe 100644 --- a/src/indexer/flat_map_with_buffer.rs +++ b/src/indexer/flat_map_with_buffer.rs @@ -1,10 +1,10 @@ -struct FlatMapWithgBuffer { +pub struct FlatMapWithBuffer { buffer: Vec, fill_buffer: F, underlying_it: Iter, } -impl Iterator for FlatMapWithgBuffer +impl Iterator for FlatMapWithBuffer where Iter: Iterator, F: Fn(I, &mut Vec), @@ -22,53 +22,48 @@ where } } -/// FUnction similar to `flat_map`, but the generating function fills a buffer -/// instead of returning an Iterator. -pub fn flat_map_with_buffer( - underlying_it: Iter, - fill_buffer: F, -) -> impl Iterator -where - F: Fn(I, &mut Vec), - Iter: Iterator, -{ - FlatMapWithgBuffer { - buffer: Vec::with_capacity(10), - fill_buffer, - underlying_it, +pub trait FlatMapWithBufferIter: Iterator { + /// Function similar to `flat_map`, but allows reusing a shared `Vec`. + fn flat_map_with_buffer(self, fill_buffer: F) -> FlatMapWithBuffer + where + F: Fn(Self::Item, &mut Vec), + Self: Sized, + { + FlatMapWithBuffer { + buffer: Vec::with_capacity(10), + fill_buffer, + underlying_it: self, + } } } +impl FlatMapWithBufferIter for T where T: Iterator {} + #[cfg(test)] mod tests { - use super::flat_map_with_buffer; + use crate::indexer::flat_map_with_buffer::FlatMapWithBufferIter; #[test] fn test_flat_map_with_buffer_empty() { - let vals: Vec = flat_map_with_buffer( - std::iter::empty::(), - |_val: usize, _buffer: &mut Vec| {}, - ) - .collect(); - assert!(vals.is_empty()); + let mut empty_iter = std::iter::empty::() + .flat_map_with_buffer(|_val: usize, _buffer: &mut Vec| {}); + assert!(empty_iter.next().is_none()); } #[test] fn test_flat_map_with_buffer_simple() { - let vals: Vec = flat_map_with_buffer(1..5, |val: usize, buffer: &mut Vec| { - buffer.extend(0..val) - }) - .collect(); + let vals: Vec = (1..5) + .flat_map_with_buffer(|val: usize, buffer: &mut Vec| buffer.extend(0..val)) + .collect(); assert_eq!(&[0, 0, 1, 0, 1, 2, 0, 1, 2, 3], &vals[..]); } #[test] fn test_flat_map_filling_no_elements_does_not_stop_iterator() { - let vals: Vec = flat_map_with_buffer( - [2, 0, 0, 3].into_iter(), - |val: usize, buffer: &mut Vec| buffer.extend(0..val), - ) - .collect(); + let vals: Vec = [2, 0, 0, 3] + .into_iter() + .flat_map_with_buffer(|val: usize, buffer: &mut Vec| buffer.extend(0..val)) + .collect(); assert_eq!(&[0, 1, 0, 1, 2], &vals[..]); } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 28e845e636..c557350cf1 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -27,7 +27,6 @@ mod stamper; use crossbeam_channel as channel; use smallvec::SmallVec; -pub use self::flat_map_with_buffer::flat_map_with_buffer; pub use self::index_writer::IndexWriter; pub(crate) use self::json_term_writer::{ convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter, diff --git a/src/indexer/sorted_doc_id_multivalue_column.rs b/src/indexer/sorted_doc_id_multivalue_column.rs index c960a01bd2..45df329f54 100644 --- a/src/indexer/sorted_doc_id_multivalue_column.rs +++ b/src/indexer/sorted_doc_id_multivalue_column.rs @@ -2,9 +2,9 @@ use std::cmp; use fastfield_codecs::Column; +use super::flat_map_with_buffer::FlatMapWithBufferIter; use crate::fastfield::MultiValuedFastFieldReader; use crate::indexer::doc_id_mapping::SegmentDocIdMapping; -use crate::indexer::flat_map_with_buffer; use crate::schema::Field; use crate::{DocAddress, SegmentReader}; @@ -72,13 +72,14 @@ impl<'a> Column for RemappedDocIdMultiValueColumn<'a> { } fn iter(&self) -> Box + '_> { - Box::new(flat_map_with_buffer( - self.doc_id_mapping.iter_old_doc_addrs(), - |old_doc_addr: DocAddress, buffer| { - let ff_reader = &self.fast_field_readers[old_doc_addr.segment_ord as usize]; - ff_reader.get_vals(old_doc_addr.doc_id, buffer); - }, - )) + Box::new( + self.doc_id_mapping + .iter_old_doc_addrs() + .flat_map_with_buffer(|old_doc_addr: DocAddress, buffer| { + let ff_reader = &self.fast_field_readers[old_doc_addr.segment_ord as usize]; + ff_reader.get_vals(old_doc_addr.doc_id, buffer); + }), + ) } fn min_value(&self) -> u64 { self.min_value