From 3f88718f387df7969e68ab52ed7714681cfe3273 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 10 May 2022 16:29:16 +0800 Subject: [PATCH 01/39] refactor aggregations --- src/aggregation/agg_result.rs | 186 +----------------- src/aggregation/bucket/histogram/histogram.rs | 12 +- src/aggregation/bucket/range.rs | 2 +- src/aggregation/collector.rs | 2 +- src/aggregation/intermediate_agg_result.rs | 182 ++++++++++++++++- src/aggregation/mod.rs | 11 +- 6 files changed, 193 insertions(+), 202 deletions(-) diff --git a/src/aggregation/agg_result.rs b/src/aggregation/agg_result.rs index 3dba73d1a6..fc614990b4 100644 --- a/src/aggregation/agg_result.rs +++ b/src/aggregation/agg_result.rs @@ -4,21 +4,15 @@ //! intermediate average results, which is the sum and the number of values. The actual average is //! calculated on the step from intermediate to final aggregation result tree. -use std::cmp::Ordering; use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use super::agg_req::{ - Aggregations, AggregationsInternal, BucketAggregationInternal, MetricAggregation, -}; -use super::bucket::{intermediate_buckets_to_final_buckets, GetDocCount}; -use super::intermediate_agg_result::{ - IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry, - IntermediateMetricResult, IntermediateRangeBucketEntry, -}; +use super::agg_req::BucketAggregationInternal; +use super::bucket::GetDocCount; +use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult}; use super::metric::{SingleMetricResult, Stats}; -use super::{Key, VecWithNames}; +use super::Key; use crate::TantivyError; #[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)] @@ -41,98 +35,6 @@ impl AggregationResults { ))) } } - - /// Convert and intermediate result and its aggregation request to the final result - pub fn from_intermediate_and_req( - results: IntermediateAggregationResults, - agg: Aggregations, - ) -> crate::Result { - AggregationResults::from_intermediate_and_req_internal(results, &(agg.into())) - } - - /// Convert and intermediate result and its aggregation request to the final result - /// - /// Internal function, CollectorAggregations is used instead Aggregations, which is optimized - /// for internal processing, by splitting metric and buckets into seperate groups. - pub(crate) fn from_intermediate_and_req_internal( - intermediate_results: IntermediateAggregationResults, - req: &AggregationsInternal, - ) -> crate::Result { - // Important assumption: - // When the tree contains buckets/metric, we expect it to have all buckets/metrics from the - // request - let mut results: HashMap = HashMap::new(); - - if let Some(buckets) = intermediate_results.buckets { - add_coverted_final_buckets_to_result(&mut results, buckets, &req.buckets)? - } else { - // When there are no buckets, we create empty buckets, so that the serialized json - // format is constant - add_empty_final_buckets_to_result(&mut results, &req.buckets)? 
- }; - - if let Some(metrics) = intermediate_results.metrics { - add_converted_final_metrics_to_result(&mut results, metrics); - } else { - // When there are no metrics, we create empty metric results, so that the serialized - // json format is constant - add_empty_final_metrics_to_result(&mut results, &req.metrics)?; - } - Ok(Self(results)) - } -} - -fn add_converted_final_metrics_to_result( - results: &mut HashMap, - metrics: VecWithNames, -) { - results.extend( - metrics - .into_iter() - .map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))), - ); -} - -fn add_empty_final_metrics_to_result( - results: &mut HashMap, - req_metrics: &VecWithNames, -) -> crate::Result<()> { - results.extend(req_metrics.iter().map(|(key, req)| { - let empty_bucket = IntermediateMetricResult::empty_from_req(req); - ( - key.to_string(), - AggregationResult::MetricResult(empty_bucket.into()), - ) - })); - Ok(()) -} - -fn add_empty_final_buckets_to_result( - results: &mut HashMap, - req_buckets: &VecWithNames, -) -> crate::Result<()> { - let requested_buckets = req_buckets.iter(); - for (key, req) in requested_buckets { - let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?); - results.insert(key.to_string(), empty_bucket); - } - Ok(()) -} - -fn add_coverted_final_buckets_to_result( - results: &mut HashMap, - buckets: VecWithNames, - req_buckets: &VecWithNames, -) -> crate::Result<()> { - assert_eq!(buckets.len(), req_buckets.len()); - - let buckets_with_request = buckets.into_iter().zip(req_buckets.values()); - for ((key, bucket), req) in buckets_with_request { - let result = - AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(bucket, req)?); - results.insert(key, result); - } - Ok(()) } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -154,7 +56,8 @@ impl AggregationResult { match self { AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError( "Tried to retrieve value from bucket aggregation. 
This is not supported and \ - should not happen during collection, but should be catched during validation" + should not happen during collection phase, but should be catched during \ + validation" .to_string(), )), AggregationResult::MetricResult(metric) => metric.get_value(agg_property), @@ -230,48 +133,7 @@ pub enum BucketResult { impl BucketResult { pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result { let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg); - BucketResult::from_intermediate_and_req(empty_bucket, req) - } - - fn from_intermediate_and_req( - bucket_result: IntermediateBucketResult, - req: &BucketAggregationInternal, - ) -> crate::Result { - match bucket_result { - IntermediateBucketResult::Range(range_res) => { - let mut buckets: Vec = range_res - .buckets - .into_iter() - .map(|(_, bucket)| { - RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation) - }) - .collect::>>()?; - - buckets.sort_by(|left, right| { - // TODO use total_cmp next stable rust release - left.from - .unwrap_or(f64::MIN) - .partial_cmp(&right.from.unwrap_or(f64::MIN)) - .unwrap_or(Ordering::Equal) - }); - Ok(BucketResult::Range { buckets }) - } - IntermediateBucketResult::Histogram { buckets } => { - let buckets = intermediate_buckets_to_final_buckets( - buckets, - req.as_histogram() - .expect("unexpected aggregation, expected histogram aggregation"), - &req.sub_aggregation, - )?; - - Ok(BucketResult::Histogram { buckets }) - } - IntermediateBucketResult::Terms(terms) => terms.into_final_result( - req.as_term() - .expect("unexpected aggregation, expected term aggregation"), - &req.sub_aggregation, - ), - } + empty_bucket.into_final_bucket_result(req) } } @@ -311,22 +173,6 @@ pub struct BucketEntry { /// Sub-aggregations in this bucket. 
pub sub_aggregation: AggregationResults, } - -impl BucketEntry { - pub(crate) fn from_intermediate_and_req( - entry: IntermediateHistogramBucketEntry, - req: &AggregationsInternal, - ) -> crate::Result { - Ok(BucketEntry { - key: Key::F64(entry.key), - doc_count: entry.doc_count, - sub_aggregation: AggregationResults::from_intermediate_and_req_internal( - entry.sub_aggregation, - req, - )?, - }) - } -} impl GetDocCount for &BucketEntry { fn doc_count(&self) -> u64 { self.doc_count @@ -384,21 +230,3 @@ pub struct RangeBucketEntry { #[serde(skip_serializing_if = "Option::is_none")] pub to: Option, } - -impl RangeBucketEntry { - fn from_intermediate_and_req( - entry: IntermediateRangeBucketEntry, - req: &AggregationsInternal, - ) -> crate::Result { - Ok(RangeBucketEntry { - key: entry.key, - doc_count: entry.doc_count, - sub_aggregation: AggregationResults::from_intermediate_and_req_internal( - entry.sub_aggregation, - req, - )?, - to: entry.to, - from: entry.from, - }) - } -} diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index a2a4a87e59..0d5f5574c1 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -482,14 +482,12 @@ fn intermediate_buckets_to_final_buckets_fill_gaps( sub_aggregation: empty_sub_aggregation.clone(), }, }) - .map(|intermediate_bucket| { - BucketEntry::from_intermediate_and_req(intermediate_bucket, sub_aggregation) - }) + .map(|intermediate_bucket| intermediate_bucket.into_final_bucket_entry(sub_aggregation)) .collect::>>() } // Convert to BucketEntry -pub(crate) fn intermediate_buckets_to_final_buckets( +pub(crate) fn intermediate_histogram_buckets_to_final_buckets( buckets: Vec, histogram_req: &HistogramAggregation, sub_aggregation: &AggregationsInternal, @@ -503,8 +501,8 @@ pub(crate) fn intermediate_buckets_to_final_buckets( } else { buckets .into_iter() - .filter(|bucket| bucket.doc_count >= histogram_req.min_doc_count()) - .map(|bucket| BucketEntry::from_intermediate_and_req(bucket, sub_aggregation)) + .filter(|histogram_bucket| histogram_bucket.doc_count >= histogram_req.min_doc_count()) + .map(|histogram_bucket| histogram_bucket.into_final_bucket_entry(sub_aggregation)) .collect::>>() } } @@ -546,7 +544,7 @@ pub(crate) fn generate_buckets_with_opt_minmax( let offset = req.offset.unwrap_or(0.0); let first_bucket_num = get_bucket_num_f64(min, req.interval, offset) as i64; let last_bucket_num = get_bucket_num_f64(max, req.interval, offset) as i64; - let mut buckets = vec![]; + let mut buckets = Vec::with_capacity((first_bucket_num..=last_bucket_num).count()); for bucket_pos in first_bucket_num..=last_bucket_num { let bucket_key = bucket_pos as f64 * req.interval + offset; buckets.push(bucket_key); diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs index d570e96a37..69206b1100 100644 --- a/src/aggregation/bucket/range.rs +++ b/src/aggregation/bucket/range.rs @@ -317,7 +317,7 @@ fn to_u64_range(range: &RangeAggregationRange, field_type: &Type) -> crate::Resu } /// Extends the provided buckets to contain the whole value range, by inserting buckets at the -/// beginning and end. +/// beginning and end and filling gaps. 
fn extend_validate_ranges( buckets: &[RangeAggregationRange], field_type: &Type, diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index f35a5e3e17..47cd94e6c9 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -87,7 +87,7 @@ impl Collector for AggregationCollector { segment_fruits: Vec<::Fruit>, ) -> crate::Result { let res = merge_fruits(segment_fruits)?; - AggregationResults::from_intermediate_and_req(res, self.agg.clone()) + res.into_final_bucket_result(self.agg.clone()) } } diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index 9bde00707e..cb2f9f416c 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -3,16 +3,20 @@ //! indices. use std::cmp::Ordering; +use std::collections::HashMap; use fnv::FnvHashMap; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation}; -use super::agg_result::BucketResult; +use super::agg_req::{ + Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType, + MetricAggregation, +}; +use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry}; use super::bucket::{ - cut_off_buckets, get_agg_name_and_property, GetDocCount, Order, OrderTarget, - SegmentHistogramBucketEntry, TermsAggregation, + cut_off_buckets, get_agg_name_and_property, intermediate_histogram_buckets_to_final_buckets, + GetDocCount, Order, OrderTarget, SegmentHistogramBucketEntry, TermsAggregation, }; use super::metric::{IntermediateAverage, IntermediateStats}; use super::segment_agg_result::SegmentMetricResultCollector; @@ -31,6 +35,46 @@ pub struct IntermediateAggregationResults { } impl IntermediateAggregationResults { + /// Convert and intermediate result and its aggregation request to the final result + pub(crate) fn into_final_bucket_result( + self, + req: Aggregations, + ) -> crate::Result { + self.into_final_bucket_result_internal(&(req.into())) + } + + /// Convert and intermediate result and its aggregation request to the final result + /// + /// Internal function, AggregationsInternal is used instead Aggregations, which is optimized + /// for internal processing, by splitting metric and buckets into seperate groups. + pub(crate) fn into_final_bucket_result_internal( + self, + req: &AggregationsInternal, + ) -> crate::Result { + // Important assumption: + // When the tree contains buckets/metric, we expect it to have all buckets/metrics from the + // request + let mut results: HashMap = HashMap::new(); + + if let Some(buckets) = self.buckets { + convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)? + } else { + // When there are no buckets, we create empty buckets, so that the serialized json + // format is constant + add_empty_final_buckets_to_result(&mut results, &req.buckets)? 
+ }; + + if let Some(metrics) = self.metrics { + convert_and_add_final_metrics_to_result(&mut results, metrics); + } else { + // When there are no metrics, we create empty metric results, so that the serialized + // json format is constant + add_empty_final_metrics_to_result(&mut results, &req.metrics)?; + } + + Ok(AggregationResults(results)) + } + pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self { let metrics = if req.metrics.is_empty() { None @@ -90,6 +134,58 @@ impl IntermediateAggregationResults { } } +fn convert_and_add_final_metrics_to_result( + results: &mut HashMap, + metrics: VecWithNames, +) { + results.extend( + metrics + .into_iter() + .map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))), + ); +} + +fn add_empty_final_metrics_to_result( + results: &mut HashMap, + req_metrics: &VecWithNames, +) -> crate::Result<()> { + results.extend(req_metrics.iter().map(|(key, req)| { + let empty_bucket = IntermediateMetricResult::empty_from_req(req); + ( + key.to_string(), + AggregationResult::MetricResult(empty_bucket.into()), + ) + })); + Ok(()) +} + +fn add_empty_final_buckets_to_result( + results: &mut HashMap, + req_buckets: &VecWithNames, +) -> crate::Result<()> { + let requested_buckets = req_buckets.iter(); + for (key, req) in requested_buckets { + let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?); + results.insert(key.to_string(), empty_bucket); + } + Ok(()) +} + +fn convert_and_add_final_buckets_to_result( + results: &mut HashMap, + buckets: VecWithNames, + req_buckets: &VecWithNames, +) -> crate::Result<()> { + assert_eq!(buckets.len(), req_buckets.len()); + + let buckets_with_request = buckets.into_iter().zip(req_buckets.values()); + for ((key, bucket), req) in buckets_with_request { + let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req)?); + results.insert(key, result); + } + Ok(()) +} + /// An aggregation is either a bucket or a metric. 
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum IntermediateAggregationResult { @@ -171,6 +267,45 @@ pub enum IntermediateBucketResult { } impl IntermediateBucketResult { + pub(crate) fn into_final_bucket_result( + self, + req: &BucketAggregationInternal, + ) -> crate::Result { + match self { + IntermediateBucketResult::Range(range_res) => { + let mut buckets: Vec = range_res + .buckets + .into_iter() + .map(|(_, bucket)| bucket.into_final_bucket_entry(&req.sub_aggregation)) + .collect::>>()?; + + buckets.sort_by(|left, right| { + // TODO use total_cmp next stable rust release + left.from + .unwrap_or(f64::MIN) + .partial_cmp(&right.from.unwrap_or(f64::MIN)) + .unwrap_or(Ordering::Equal) + }); + Ok(BucketResult::Range { buckets }) + } + IntermediateBucketResult::Histogram { buckets } => { + let buckets = intermediate_histogram_buckets_to_final_buckets( + buckets, + req.as_histogram() + .expect("unexpected aggregation, expected histogram aggregation"), + &req.sub_aggregation, + )?; + + Ok(BucketResult::Histogram { buckets }) + } + IntermediateBucketResult::Terms(terms) => terms.into_final_result( + req.as_term() + .expect("unexpected aggregation, expected term aggregation"), + &req.sub_aggregation, + ), + } + } + pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self { match req { BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()), @@ -267,10 +402,9 @@ impl IntermediateTermBucketResult { Ok(BucketEntry { key: Key::Str(key), doc_count: entry.doc_count, - sub_aggregation: AggregationResults::from_intermediate_and_req_internal( - entry.sub_aggregation, - sub_aggregation_req, - )?, + sub_aggregation: entry + .sub_aggregation + .into_final_bucket_result_internal(sub_aggregation_req)?, }) }) .collect::>()?; @@ -374,6 +508,21 @@ pub struct IntermediateHistogramBucketEntry { pub sub_aggregation: IntermediateAggregationResults, } +impl IntermediateHistogramBucketEntry { + pub(crate) fn into_final_bucket_entry( + self, + req: &AggregationsInternal, + ) -> crate::Result { + Ok(BucketEntry { + key: Key::F64(self.key), + doc_count: self.doc_count, + sub_aggregation: self + .sub_aggregation + .into_final_bucket_result_internal(req)?, + }) + } +} + impl From for IntermediateHistogramBucketEntry { fn from(entry: SegmentHistogramBucketEntry) -> Self { IntermediateHistogramBucketEntry { @@ -402,6 +551,23 @@ pub struct IntermediateRangeBucketEntry { pub to: Option, } +impl IntermediateRangeBucketEntry { + pub(crate) fn into_final_bucket_entry( + self, + req: &AggregationsInternal, + ) -> crate::Result { + Ok(RangeBucketEntry { + key: self.key, + doc_count: self.doc_count, + sub_aggregation: self + .sub_aggregation + .into_final_bucket_result_internal(req)?, + to: self.to, + from: self.from, + }) + } +} + /// This is the term entry for a bucket, which contains a count, and optionally /// sub_aggregations. 
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)] diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 37fa05c0ff..dfaaf3265a 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -545,11 +545,10 @@ mod tests { let collector = DistributedAggregationCollector::from_aggs(agg_req.clone()); let searcher = reader.searcher(); - AggregationResults::from_intermediate_and_req( - searcher.search(&AllQuery, &collector).unwrap(), - agg_req, - ) - .unwrap() + let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap(); + intermediate_agg_result + .into_final_bucket_result(agg_req) + .unwrap() } else { let collector = AggregationCollector::from_aggs(agg_req); @@ -985,7 +984,7 @@ mod tests { // Test de/serialization roundtrip on intermediate_agg_result let res: IntermediateAggregationResults = serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap(); - AggregationResults::from_intermediate_and_req(res, agg_req.clone()).unwrap() + res.into_final_bucket_result(agg_req.clone()).unwrap() } else { let collector = AggregationCollector::from_aggs(agg_req.clone()); From a99e5459e3e58064ab8917b81e21e924e5ca0548 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 11 May 2022 17:05:50 +0800 Subject: [PATCH 02/39] return result from segment collector --- examples/custom_collector.rs | 3 +- src/aggregation/collector.rs | 3 +- src/collector/count_collector.rs | 11 +++--- src/collector/custom_score_top_collector.rs | 3 +- src/collector/docset_collector.rs | 3 +- src/collector/facet_collector.rs | 3 +- src/collector/filter_collector_wrapper.rs | 5 +-- src/collector/histogram_collector.rs | 3 +- src/collector/mod.rs | 38 ++++++++++++--------- src/collector/multi_collector.rs | 16 +++++---- src/collector/tests.rs | 9 +++-- src/collector/top_score_collector.rs | 3 +- src/collector/tweak_score_top_collector.rs | 3 +- 13 files changed, 61 insertions(+), 42 deletions(-) diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 7bdc9d06b4..12f846a430 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -102,11 +102,12 @@ struct StatsSegmentCollector { impl SegmentCollector for StatsSegmentCollector { type Fruit = Option; - fn collect(&mut self, doc: u32, _score: Score) { + fn collect(&mut self, doc: u32, _score: Score) -> crate::Result<()> { let value = self.fast_field_reader.get(doc) as f64; self.stats.count += 1; self.stats.sum += value; self.stats.squared_sum += value * value; + Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 47cd94e6c9..3cbbbcdc4e 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -132,8 +132,9 @@ impl SegmentCollector for AggregationSegmentCollector { type Fruit = crate::Result; #[inline] - fn collect(&mut self, doc: crate::DocId, _score: crate::Score) { + fn collect(&mut self, doc: crate::DocId, _score: crate::Score) -> crate::Result<()> { self.result.collect(doc, &self.aggs_with_accessor); + Ok(()) } fn harvest(mut self) -> Self::Fruit { diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 075a4f36b4..02f30f85c1 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -65,8 +65,9 @@ pub struct SegmentCountCollector { impl SegmentCollector for SegmentCountCollector { type Fruit = usize; - fn collect(&mut self, _: DocId, _: Score) { + fn collect(&mut self, _: DocId, _: Score) -> crate::Result<()> { self.count += 1; + 
Ok(()) } fn harvest(self) -> usize { @@ -92,18 +93,18 @@ mod tests { } { let mut count_collector = SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0); + count_collector.collect(0u32, 1.0).unwrap(); assert_eq!(count_collector.harvest(), 1); } { let mut count_collector = SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0); + count_collector.collect(0u32, 1.0).unwrap(); assert_eq!(count_collector.harvest(), 1); } { let mut count_collector = SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0); - count_collector.collect(1u32, 1.0); + count_collector.collect(0u32, 1.0).unwrap(); + count_collector.collect(1u32, 1.0).unwrap(); assert_eq!(count_collector.harvest(), 2); } } diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index d645004ade..d597727ed1 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -90,9 +90,10 @@ where { type Fruit = Vec<(TScore, DocAddress)>; - fn collect(&mut self, doc: DocId, _score: Score) { + fn collect(&mut self, doc: DocId, _score: Score) -> crate::Result<()> { let score = self.segment_scorer.score(doc); self.segment_collector.collect(doc, score); + Ok(()) } fn harvest(self) -> Vec<(TScore, DocAddress)> { diff --git a/src/collector/docset_collector.rs b/src/collector/docset_collector.rs index a27a394189..9f6a5fd3bd 100644 --- a/src/collector/docset_collector.rs +++ b/src/collector/docset_collector.rs @@ -50,8 +50,9 @@ pub struct DocSetChildCollector { impl SegmentCollector for DocSetChildCollector { type Fruit = (u32, HashSet); - fn collect(&mut self, doc: crate::DocId, _score: Score) { + fn collect(&mut self, doc: crate::DocId, _score: Score) -> crate::Result<()> { self.docs.insert(doc); + Ok(()) } fn harvest(self) -> (u32, HashSet) { diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index e2ef47f989..8ad3311e28 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -333,7 +333,7 @@ impl Collector for FacetCollector { impl SegmentCollector for FacetSegmentCollector { type Fruit = FacetCounts; - fn collect(&mut self, doc: DocId, _: Score) { + fn collect(&mut self, doc: DocId, _: Score) -> crate::Result<()> { self.reader.facet_ords(doc, &mut self.facet_ords_buf); let mut previous_collapsed_ord: usize = usize::MAX; for &facet_ord in &self.facet_ords_buf { @@ -345,6 +345,7 @@ impl SegmentCollector for FacetSegmentCollector { }; previous_collapsed_ord = collapsed_ord; } + Ok(()) } /// Returns the results of the collection. 
diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index b1dbaaa203..15e7f80212 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -173,11 +173,12 @@ where { type Fruit = TSegmentCollector::Fruit; - fn collect(&mut self, doc: u32, score: Score) { + fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { let value = self.fast_field_reader.get(doc); if (self.predicate)(value) { - self.segment_collector.collect(doc, score) + self.segment_collector.collect(doc, score)?; } + Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index 22956a86a2..fbf398627a 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -91,9 +91,10 @@ pub struct SegmentHistogramCollector { impl SegmentCollector for SegmentHistogramCollector { type Fruit = Vec; - fn collect(&mut self, doc: DocId, _score: Score) { + fn collect(&mut self, doc: DocId, _score: Score) -> crate::Result<()> { let value = self.ff_reader.get(doc); self.histogram_computer.add_value(value); + Ok(()) } fn harvest(self) -> Self::Fruit { diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 1597d7fe45..97b6020340 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -175,12 +175,12 @@ pub trait Collector: Sync + Send { if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) { - segment_collector.collect(doc, score); + segment_collector.collect(doc, score).unwrap(); // TODO } })?; } else { weight.for_each(reader, &mut |doc, score| { - segment_collector.collect(doc, score); + segment_collector.collect(doc, score).unwrap(); // TODO })?; } Ok(segment_collector.harvest()) @@ -190,10 +190,11 @@ pub trait Collector: Sync + Send { impl SegmentCollector for Option { type Fruit = Option; - fn collect(&mut self, doc: DocId, score: Score) { + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { if let Some(segment_collector) = self { - segment_collector.collect(doc, score); + segment_collector.collect(doc, score)?; } + Ok(()) } fn harvest(self) -> Self::Fruit { @@ -253,7 +254,7 @@ pub trait SegmentCollector: 'static { type Fruit: Fruit; /// The query pushes the scored document to the collector via this method. - fn collect(&mut self, doc: DocId, score: Score); + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()>; /// Extract the fruit of the collection from the `SegmentCollector`. 
fn harvest(self) -> Self::Fruit; @@ -308,9 +309,10 @@ where { type Fruit = (Left::Fruit, Right::Fruit); - fn collect(&mut self, doc: DocId, score: Score) { - self.0.collect(doc, score); - self.1.collect(doc, score); + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + self.0.collect(doc, score)?; + self.1.collect(doc, score)?; + Ok(()) } fn harvest(self) -> ::Fruit { @@ -372,10 +374,11 @@ where { type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); - fn collect(&mut self, doc: DocId, score: Score) { - self.0.collect(doc, score); - self.1.collect(doc, score); - self.2.collect(doc, score); + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + self.0.collect(doc, score)?; + self.1.collect(doc, score)?; + self.2.collect(doc, score)?; + Ok(()) } fn harvest(self) -> ::Fruit { @@ -446,11 +449,12 @@ where { type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); - fn collect(&mut self, doc: DocId, score: Score) { - self.0.collect(doc, score); - self.1.collect(doc, score); - self.2.collect(doc, score); - self.3.collect(doc, score); + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + self.0.collect(doc, score)?; + self.1.collect(doc, score)?; + self.2.collect(doc, score)?; + self.3.collect(doc, score)?; + Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 039902ff4f..7b119ad868 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -52,8 +52,9 @@ impl Collector for CollectorWrapper { impl SegmentCollector for Box { type Fruit = Box; - fn collect(&mut self, doc: u32, score: Score) { - self.as_mut().collect(doc, score); + fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { + self.as_mut().collect(doc, score)?; + Ok(()) } fn harvest(self) -> Box { @@ -62,7 +63,7 @@ impl SegmentCollector for Box { } pub trait BoxableSegmentCollector { - fn collect(&mut self, doc: u32, score: Score); + fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()>; fn harvest_from_box(self: Box) -> Box; } @@ -71,8 +72,8 @@ pub struct SegmentCollectorWrapper(TSegment impl BoxableSegmentCollector for SegmentCollectorWrapper { - fn collect(&mut self, doc: u32, score: Score) { - self.0.collect(doc, score); + fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { + self.0.collect(doc, score) } fn harvest_from_box(self: Box) -> Box { @@ -228,10 +229,11 @@ pub struct MultiCollectorChild { impl SegmentCollector for MultiCollectorChild { type Fruit = MultiFruit; - fn collect(&mut self, doc: DocId, score: Score) { + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { for child in &mut self.children { - child.collect(doc, score); + child.collect(doc, score)?; } + Ok(()) } fn harvest(self) -> MultiFruit { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 3bda822a10..5e0a0cfb2d 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -138,9 +138,10 @@ impl Collector for TestCollector { impl SegmentCollector for TestSegmentCollector { type Fruit = TestFruit; - fn collect(&mut self, doc: DocId, score: Score) { + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { self.fruit.docs.push(DocAddress::new(self.segment_id, doc)); self.fruit.scores.push(score); + Ok(()) } fn harvest(self) -> ::Fruit { @@ -198,9 +199,10 @@ impl Collector for FastFieldTestCollector { impl SegmentCollector for FastFieldSegmentCollector { type Fruit = Vec; - fn collect(&mut self, doc: 
DocId, _score: Score) { + fn collect(&mut self, doc: DocId, _score: Score) -> crate::Result<()> { let val = self.reader.get(doc); self.vals.push(val); + Ok(()) } fn harvest(self) -> Vec { @@ -255,9 +257,10 @@ impl Collector for BytesFastFieldTestCollector { impl SegmentCollector for BytesFastFieldSegmentCollector { type Fruit = Vec; - fn collect(&mut self, doc: u32, _score: Score) { + fn collect(&mut self, doc: u32, _score: Score) -> crate::Result<()> { let data = self.reader.get_bytes(doc); self.vals.extend(data); + Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 516dedcb58..e0e3aeb9dc 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -699,8 +699,9 @@ pub struct TopScoreSegmentCollector(TopSegmentCollector); impl SegmentCollector for TopScoreSegmentCollector { type Fruit = Vec<(Score, DocAddress)>; - fn collect(&mut self, doc: DocId, score: Score) { + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { self.0.collect(doc, score); + Ok(()) } fn harvest(self) -> Vec<(Score, DocAddress)> { diff --git a/src/collector/tweak_score_top_collector.rs b/src/collector/tweak_score_top_collector.rs index 1a81e73616..cb8d358abe 100644 --- a/src/collector/tweak_score_top_collector.rs +++ b/src/collector/tweak_score_top_collector.rs @@ -93,9 +93,10 @@ where { type Fruit = Vec<(TScore, DocAddress)>; - fn collect(&mut self, doc: DocId, score: Score) { + fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { let score = self.segment_scorer.score(doc, score); self.segment_collector.collect(doc, score); + Ok(()) } fn harvest(self) -> Vec<(TScore, DocAddress)> { From 6a4632211a3f23b2051c42157959eaa3255cd0eb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 11 May 2022 18:03:29 +0800 Subject: [PATCH 03/39] forward error in aggregation collect --- src/aggregation/agg_req_with_accessor.rs | 2 ++ src/aggregation/bucket/histogram/histogram.rs | 25 ++++++++++-------- src/aggregation/bucket/range.rs | 20 +++++++------- src/aggregation/bucket/term_agg.rs | 26 ++++++++++--------- src/aggregation/collector.rs | 4 +-- src/aggregation/segment_agg_result.rs | 21 ++++++++------- 6 files changed, 55 insertions(+), 43 deletions(-) diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs index 8ed82ac5c6..ed44ab6ef9 100644 --- a/src/aggregation/agg_req_with_accessor.rs +++ b/src/aggregation/agg_req_with_accessor.rs @@ -1,5 +1,7 @@ //! This will enhance the request tree with access to the fastfield and metadata. 
+use std::rc::Rc; +use std::sync::atomic::AtomicU32; use std::sync::Arc; use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation}; diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index 0d5f5574c1..c2a7e3472d 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -311,7 +311,7 @@ impl SegmentHistogramCollector { doc: &[DocId], bucket_with_accessor: &BucketAggregationWithAccessor, force_flush: bool, - ) { + ) -> crate::Result<()> { let bounds = self.bounds; let interval = self.interval; let offset = self.offset; @@ -341,28 +341,28 @@ impl SegmentHistogramCollector { bucket_pos0, docs[0], &bucket_with_accessor.sub_aggregation, - ); + )?; self.increment_bucket_if_in_bounds( val1, &bounds, bucket_pos1, docs[1], &bucket_with_accessor.sub_aggregation, - ); + )?; self.increment_bucket_if_in_bounds( val2, &bounds, bucket_pos2, docs[2], &bucket_with_accessor.sub_aggregation, - ); + )?; self.increment_bucket_if_in_bounds( val3, &bounds, bucket_pos3, docs[3], &bucket_with_accessor.sub_aggregation, - ); + )?; } for doc in iter.remainder() { let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type); @@ -376,16 +376,17 @@ impl SegmentHistogramCollector { self.buckets[bucket_pos].key, get_bucket_val(val, self.interval, self.offset) as f64 ); - self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation); + self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?; } if force_flush { if let Some(sub_aggregations) = self.sub_aggregations.as_mut() { for sub_aggregation in sub_aggregations { sub_aggregation - .flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush); + .flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?; } } } + Ok(()) } #[inline] @@ -396,15 +397,16 @@ impl SegmentHistogramCollector { bucket_pos: usize, doc: DocId, bucket_with_accessor: &AggregationsWithAccessor, - ) { + ) -> crate::Result<()> { if bounds.contains(val) { debug_assert_eq!( self.buckets[bucket_pos].key, get_bucket_val(val, self.interval, self.offset) as f64 ); - self.increment_bucket(bucket_pos, doc, bucket_with_accessor); + self.increment_bucket(bucket_pos, doc, bucket_with_accessor)?; } + Ok(()) } #[inline] @@ -413,12 +415,13 @@ impl SegmentHistogramCollector { bucket_pos: usize, doc: DocId, bucket_with_accessor: &AggregationsWithAccessor, - ) { + ) -> crate::Result<()> { let bucket = &mut self.buckets[bucket_pos]; bucket.doc_count += 1; if let Some(sub_aggregation) = self.sub_aggregations.as_mut() { - (&mut sub_aggregation[bucket_pos]).collect(doc, bucket_with_accessor); + (&mut sub_aggregation[bucket_pos]).collect(doc, bucket_with_accessor)?; } + Ok(()) } fn f64_from_fastfield_u64(&self, val: u64) -> f64 { diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs index 69206b1100..610453e036 100644 --- a/src/aggregation/bucket/range.rs +++ b/src/aggregation/bucket/range.rs @@ -224,7 +224,7 @@ impl SegmentRangeCollector { doc: &[DocId], bucket_with_accessor: &BucketAggregationWithAccessor, force_flush: bool, - ) { + ) -> crate::Result<()> { let mut iter = doc.chunks_exact(4); let accessor = bucket_with_accessor .accessor @@ -240,24 +240,25 @@ impl SegmentRangeCollector { let bucket_pos3 = self.get_bucket_pos(val3); let bucket_pos4 = self.get_bucket_pos(val4); - self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation); - 
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation); - self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation); - self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation); + self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation)?; + self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation)?; + self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?; + self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?; } for doc in iter.remainder() { let val = accessor.get(*doc); let bucket_pos = self.get_bucket_pos(val); - self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation); + self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?; } if force_flush { for bucket in &mut self.buckets { if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation { sub_aggregation - .flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush); + .flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?; } } } + Ok(()) } #[inline] @@ -266,13 +267,14 @@ impl SegmentRangeCollector { bucket_pos: usize, doc: DocId, bucket_with_accessor: &AggregationsWithAccessor, - ) { + ) -> crate::Result<()> { let bucket = &mut self.buckets[bucket_pos]; bucket.bucket.doc_count += 1; if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation { - sub_aggregation.collect(doc, bucket_with_accessor); + sub_aggregation.collect(doc, bucket_with_accessor)?; } + Ok(()) } #[inline] diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index c9833c8853..1ed63d3e65 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -246,8 +246,7 @@ impl TermBuckets { doc: DocId, bucket_with_accessor: &AggregationsWithAccessor, blueprint: &Option, - ) { - // self.ensure_vec_exists(term_ids); + ) -> crate::Result<()> { for &term_id in term_ids { let entry = self .entries @@ -255,17 +254,19 @@ impl TermBuckets { .or_insert_with(|| TermBucketEntry::from_blueprint(blueprint)); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, bucket_with_accessor); + sub_aggregations.collect(doc, bucket_with_accessor)?; } } + Ok(()) } - fn force_flush(&mut self, agg_with_accessor: &AggregationsWithAccessor) { + fn force_flush(&mut self, agg_with_accessor: &AggregationsWithAccessor) -> crate::Result<()> { for entry in &mut self.entries.values_mut() { if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.flush_staged_docs(agg_with_accessor, false); + sub_aggregations.flush_staged_docs(agg_with_accessor, false)?; } } + Ok(()) } } @@ -421,7 +422,7 @@ impl SegmentTermCollector { doc: &[DocId], bucket_with_accessor: &BucketAggregationWithAccessor, force_flush: bool, - ) { + ) -> crate::Result<()> { let accessor = bucket_with_accessor .accessor .as_multi() @@ -442,25 +443,25 @@ impl SegmentTermCollector { docs[0], &bucket_with_accessor.sub_aggregation, &self.blueprint, - ); + )?; self.term_buckets.increment_bucket( &vals2, docs[1], &bucket_with_accessor.sub_aggregation, &self.blueprint, - ); + )?; self.term_buckets.increment_bucket( &vals3, docs[2], &bucket_with_accessor.sub_aggregation, &self.blueprint, - ); + )?; self.term_buckets.increment_bucket( &vals4, docs[3], &bucket_with_accessor.sub_aggregation, &self.blueprint, - ); + )?; } for &doc in 
iter.remainder() { accessor.get_vals(doc, &mut vals1); @@ -470,12 +471,13 @@ impl SegmentTermCollector { doc, &bucket_with_accessor.sub_aggregation, &self.blueprint, - ); + )?; } if force_flush { self.term_buckets - .force_flush(&bucket_with_accessor.sub_aggregation); + .force_flush(&bucket_with_accessor.sub_aggregation)?; } + Ok(()) } } diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 3cbbbcdc4e..69913931cd 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -133,13 +133,13 @@ impl SegmentCollector for AggregationSegmentCollector { #[inline] fn collect(&mut self, doc: crate::DocId, _score: crate::Score) -> crate::Result<()> { - self.result.collect(doc, &self.aggs_with_accessor); + self.result.collect(doc, &self.aggs_with_accessor)?; Ok(()) } fn harvest(mut self) -> Self::Fruit { self.result - .flush_staged_docs(&self.aggs_with_accessor, true); + .flush_staged_docs(&self.aggs_with_accessor, true)?; self.result .into_intermediate_aggregations_result(&self.aggs_with_accessor) } diff --git a/src/aggregation/segment_agg_result.rs b/src/aggregation/segment_agg_result.rs index 81f2b85de9..121fb4cf3f 100644 --- a/src/aggregation/segment_agg_result.rs +++ b/src/aggregation/segment_agg_result.rs @@ -115,21 +115,22 @@ impl SegmentAggregationResultsCollector { &mut self, doc: crate::DocId, agg_with_accessor: &AggregationsWithAccessor, - ) { + ) -> crate::Result<()> { self.staged_docs[self.num_staged_docs] = doc; self.num_staged_docs += 1; if self.num_staged_docs == self.staged_docs.len() { - self.flush_staged_docs(agg_with_accessor, false); + self.flush_staged_docs(agg_with_accessor, false)?; } + Ok(()) } pub(crate) fn flush_staged_docs( &mut self, agg_with_accessor: &AggregationsWithAccessor, force_flush: bool, - ) { + ) -> crate::Result<()> { if self.num_staged_docs == 0 { - return; + return Ok(()); } if let Some(metrics) = &mut self.metrics { for (collector, agg_with_accessor) in @@ -148,11 +149,12 @@ impl SegmentAggregationResultsCollector { &self.staged_docs[..self.num_staged_docs], agg_with_accessor, force_flush, - ); + )?; } } self.num_staged_docs = 0; + Ok(()) } } @@ -256,17 +258,18 @@ impl SegmentBucketResultCollector { doc: &[DocId], bucket_with_accessor: &BucketAggregationWithAccessor, force_flush: bool, - ) { + ) -> crate::Result<()> { match self { SegmentBucketResultCollector::Range(range) => { - range.collect_block(doc, bucket_with_accessor, force_flush); + range.collect_block(doc, bucket_with_accessor, force_flush)?; } SegmentBucketResultCollector::Histogram(histogram) => { - histogram.collect_block(doc, bucket_with_accessor, force_flush) + histogram.collect_block(doc, bucket_with_accessor, force_flush)?; } SegmentBucketResultCollector::Terms(terms) => { - terms.collect_block(doc, bucket_with_accessor, force_flush) + terms.collect_block(doc, bucket_with_accessor, force_flush)?; } } + Ok(()) } } From 11ac4512504926e8b301ba5fc593c46f066f0430 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 12 May 2022 12:18:29 +0800 Subject: [PATCH 04/39] abort aggregation when too many buckets are created Validation happens on different phases depending on the aggregation Term: During segment collection Histogram: At the end when converting in intermediate buckets (we preallocate empty buckets for the range) Revisit after #1370 Range: When validating the request update CHANGELOG --- CHANGELOG.md | 1 + examples/custom_collector.rs | 2 +- src/aggregation/agg_req_with_accessor.rs | 5 ++ src/aggregation/bucket/histogram/histogram.rs | 9 ++- 
src/aggregation/bucket/range.rs | 26 +++++++-- src/aggregation/bucket/term_agg.rs | 57 +++++++++++++++---- src/aggregation/mod.rs | 15 +++-- src/aggregation/segment_agg_result.rs | 14 ++++- src/collector/mod.rs | 6 +- src/query/boolean_query/boolean_weight.rs | 6 +- src/query/term_query/term_weight.rs | 4 +- src/query/weight.rs | 11 ++-- 12 files changed, 118 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00d88d2938..4282b830a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Unreleased - Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz) - Add support for fastfield on text fields (@PSeitz) - Add terms aggregation (@PSeitz) +- API Change: `SegmentCollector.collect` changed to return a `Result`. Tantivy 0.17 ================================ diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 12f846a430..14a9d20358 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -102,7 +102,7 @@ struct StatsSegmentCollector { impl SegmentCollector for StatsSegmentCollector { type Fruit = Option; - fn collect(&mut self, doc: u32, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: u32, _score: Score) -> tantivy::Result<()> { let value = self.fast_field_reader.get(doc) as f64; self.stats.count += 1; self.stats.sum += value; diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs index ed44ab6ef9..10597b3de0 100644 --- a/src/aggregation/agg_req_with_accessor.rs +++ b/src/aggregation/agg_req_with_accessor.rs @@ -62,6 +62,7 @@ pub struct BucketAggregationWithAccessor { pub(crate) field_type: Type, pub(crate) bucket_agg: BucketAggregationType, pub(crate) sub_aggregation: AggregationsWithAccessor, + pub(crate) bucket_count: Rc, } impl BucketAggregationWithAccessor { @@ -69,6 +70,7 @@ impl BucketAggregationWithAccessor { bucket: &BucketAggregationType, sub_aggregation: &Aggregations, reader: &SegmentReader, + bucket_count: Rc, ) -> crate::Result { let mut inverted_index = None; let (accessor, field_type) = match &bucket { @@ -97,6 +99,7 @@ impl BucketAggregationWithAccessor { sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?, bucket_agg: bucket.clone(), inverted_index, + bucket_count, }) } } @@ -137,6 +140,7 @@ pub(crate) fn get_aggs_with_accessor_and_validate( aggs: &Aggregations, reader: &SegmentReader, ) -> crate::Result { + let bucket_count: Rc = Default::default(); let mut metrics = vec![]; let mut buckets = vec![]; for (key, agg) in aggs.iter() { @@ -147,6 +151,7 @@ pub(crate) fn get_aggs_with_accessor_and_validate( &bucket.bucket_agg, &bucket.sub_aggregation, reader, + Rc::clone(&bucket_count), )?, )), Aggregation::Metric(metric) => metrics.push(( diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index c2a7e3472d..69111c71fc 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -13,7 +13,9 @@ use crate::aggregation::f64_from_fastfield_u64; use crate::aggregation::intermediate_agg_result::{ IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry, }; -use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; +use crate::aggregation::segment_agg_result::{ + validate_bucket_count, SegmentAggregationResultsCollector, +}; use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; use crate::schema::Type; use 
crate::{DocId, TantivyError}; @@ -250,6 +252,11 @@ impl SegmentHistogramCollector { ); }; + agg_with_accessor + .bucket_count + .fetch_add(buckets.len() as u32, std::sync::atomic::Ordering::Relaxed); + validate_bucket_count(&agg_with_accessor.bucket_count)?; + Ok(IntermediateBucketResult::Histogram { buckets }) } diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs index 610453e036..590165c158 100644 --- a/src/aggregation/bucket/range.rs +++ b/src/aggregation/bucket/range.rs @@ -1,6 +1,9 @@ use std::fmt::Debug; use std::ops::Range; +use std::rc::Rc; +use std::sync::atomic::AtomicU32; +use fnv::FnvHashMap; use serde::{Deserialize, Serialize}; use crate::aggregation::agg_req_with_accessor::{ @@ -9,8 +12,10 @@ use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::intermediate_agg_result::{ IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult, }; -use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; -use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key}; +use crate::aggregation::segment_agg_result::{ + validate_bucket_count, SegmentAggregationResultsCollector, +}; +use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey}; use crate::fastfield::FastFieldReader; use crate::schema::Type; use crate::{DocId, TantivyError}; @@ -153,7 +158,7 @@ impl SegmentRangeCollector { ) -> crate::Result { let field_type = self.field_type; - let buckets = self + let buckets: FnvHashMap = self .buckets .into_iter() .map(move |range_bucket| { @@ -174,12 +179,13 @@ impl SegmentRangeCollector { pub(crate) fn from_req_and_validate( req: &RangeAggregation, sub_aggregation: &AggregationsWithAccessor, + bucket_count: &Rc, field_type: Type, ) -> crate::Result { // The range input on the request is f64. // We need to convert to u64 ranges, because we read the values as u64. // The mapping from the conversion is monotonic so ordering is preserved. - let buckets = extend_validate_ranges(&req.ranges, &field_type)? + let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)? 
.iter() .map(|range| { let to = if range.end == u64::MAX { @@ -212,6 +218,9 @@ impl SegmentRangeCollector { }) .collect::>()?; + bucket_count.fetch_add(buckets.len() as u32, std::sync::atomic::Ordering::Relaxed); + validate_bucket_count(bucket_count)?; + Ok(SegmentRangeCollector { buckets, field_type, @@ -403,8 +412,13 @@ mod tests { ranges, }; - SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type) - .expect("unexpected error") + SegmentRangeCollector::from_req_and_validate( + &req, + &Default::default(), + &Default::default(), + field_type, + ) + .expect("unexpected error") } #[test] diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 1ed63d3e65..312522017e 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -11,7 +11,9 @@ use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::intermediate_agg_result::{ IntermediateBucketResult, IntermediateTermBucketEntry, IntermediateTermBucketResult, }; -use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; +use crate::aggregation::segment_agg_result::{ + validate_bucket_count, SegmentAggregationResultsCollector, +}; use crate::error::DataCorruption; use crate::fastfield::MultiValuedFastFieldReader; use crate::schema::Type; @@ -244,19 +246,23 @@ impl TermBuckets { &mut self, term_ids: &[u64], doc: DocId, - bucket_with_accessor: &AggregationsWithAccessor, + bucket_with_accessor: &BucketAggregationWithAccessor, blueprint: &Option, ) -> crate::Result<()> { for &term_id in term_ids { - let entry = self - .entries - .entry(term_id as u32) - .or_insert_with(|| TermBucketEntry::from_blueprint(blueprint)); + let entry = self.entries.entry(term_id as u32).or_insert_with(|| { + bucket_with_accessor + .bucket_count + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + TermBucketEntry::from_blueprint(blueprint) + }); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, bucket_with_accessor)?; + sub_aggregations.collect(doc, &bucket_with_accessor.sub_aggregation)?; } } + validate_bucket_count(&bucket_with_accessor.bucket_count)?; Ok(()) } @@ -441,25 +447,25 @@ impl SegmentTermCollector { self.term_buckets.increment_bucket( &vals1, docs[0], - &bucket_with_accessor.sub_aggregation, + bucket_with_accessor, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals2, docs[1], - &bucket_with_accessor.sub_aggregation, + bucket_with_accessor, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals3, docs[2], - &bucket_with_accessor.sub_aggregation, + bucket_with_accessor, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals4, docs[3], - &bucket_with_accessor.sub_aggregation, + bucket_with_accessor, &self.blueprint, )?; } @@ -469,7 +475,7 @@ impl SegmentTermCollector { self.term_buckets.increment_bucket( &vals1, doc, - &bucket_with_accessor.sub_aggregation, + bucket_with_accessor, &self.blueprint, )?; } @@ -1175,6 +1181,33 @@ mod tests { Ok(()) } + #[test] + fn terms_aggregation_term_bucket_limit() -> crate::Result<()> { + let terms: Vec = (0..100_000).map(|el| el.to_string()).collect(); + let terms_per_segment = vec![terms.iter().map(|el| el.as_str()).collect()]; + + let index = get_test_index_from_terms(true, &terms_per_segment)?; + + let agg_req: Aggregations = vec![( + "my_texts".to_string(), + Aggregation::Bucket(BucketAggregation { + bucket_agg: BucketAggregationType::Terms(TermsAggregation { + field: 
"string_id".to_string(), + min_doc_count: Some(0), + ..Default::default() + }), + sub_aggregation: Default::default(), + }), + )] + .into_iter() + .collect(); + + let res = exec_request_with_query(agg_req, &index, None); + assert!(res.is_err()); + + Ok(()) + } + #[test] fn test_json_format() -> crate::Result<()> { let agg_req: Aggregations = vec![( diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index dfaaf3265a..7fe2d82847 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -417,7 +417,9 @@ mod tests { let mut schema_builder = Schema::builder(); let text_fieldtype = crate::schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default() + .set_index_option(IndexRecordOption::Basic) + .set_fieldnorms(false), ) .set_fast() .set_stored(); @@ -435,7 +437,8 @@ mod tests { ); let index = Index::create_in_ram(schema_builder.build()); { - let mut index_writer = index.writer_for_tests()?; + // let mut index_writer = index.writer_for_tests()?; + let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; for values in segment_and_values { for (i, term) in values { let i = *i; @@ -457,9 +460,11 @@ mod tests { let segment_ids = index .searchable_segment_ids() .expect("Searchable segments failed."); - let mut index_writer = index.writer_for_tests()?; - index_writer.merge(&segment_ids).wait()?; - index_writer.wait_merging_threads()?; + if segment_ids.len() > 1 { + let mut index_writer = index.writer_for_tests()?; + index_writer.merge(&segment_ids).wait()?; + index_writer.wait_merging_threads()?; + } } Ok(index) diff --git a/src/aggregation/segment_agg_result.rs b/src/aggregation/segment_agg_result.rs index 121fb4cf3f..57c545f7d6 100644 --- a/src/aggregation/segment_agg_result.rs +++ b/src/aggregation/segment_agg_result.rs @@ -4,6 +4,8 @@ //! merging. 
use std::fmt::Debug; +use std::rc::Rc; +use std::sync::atomic::AtomicU32; use super::agg_req::MetricAggregation; use super::agg_req_with_accessor::{ @@ -16,7 +18,7 @@ use super::metric::{ }; use super::VecWithNames; use crate::aggregation::agg_req::BucketAggregationType; -use crate::DocId; +use crate::{DocId, TantivyError}; pub(crate) const DOC_BLOCK_SIZE: usize = 64; pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE]; @@ -236,6 +238,7 @@ impl SegmentBucketResultCollector { Ok(Self::Range(SegmentRangeCollector::from_req_and_validate( range_req, &req.sub_aggregation, + &req.bucket_count, req.field_type, )?)) } @@ -273,3 +276,12 @@ impl SegmentBucketResultCollector { Ok(()) } } + +pub(crate) fn validate_bucket_count(bucket_count: &Rc) -> crate::Result<()> { + if bucket_count.load(std::sync::atomic::Ordering::Relaxed) > 65000 { + return Err(TantivyError::InvalidArgument( + "Aborting aggregation because too many buckets were created".to_string(), + )); + } + Ok(()) +} diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 97b6020340..6d600bb488 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -175,12 +175,14 @@ pub trait Collector: Sync + Send { if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) { - segment_collector.collect(doc, score).unwrap(); // TODO + segment_collector.collect(doc, score)?; } + Ok(()) })?; } else { weight.for_each(reader, &mut |doc, score| { - segment_collector.collect(doc, score).unwrap(); // TODO + segment_collector.collect(doc, score)?; + Ok(()) })?; } Ok(segment_collector.harvest()) diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index f2a5fd376a..9024a6abb8 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -186,17 +186,17 @@ impl Weight for BooleanWeight { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score), + callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, ) -> crate::Result<()> { let scorer = self.complex_scorer::(reader, 1.0)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { let mut union_scorer = Union::::from(term_scorers); - for_each_scorer(&mut union_scorer, callback); + for_each_scorer(&mut union_scorer, callback)?; } SpecializedScorer::Other(mut scorer) => { - for_each_scorer(scorer.as_mut(), callback); + for_each_scorer(scorer.as_mut(), callback)?; } } Ok(()) diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 4e742bc444..e1529027b2 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -49,10 +49,10 @@ impl Weight for TermWeight { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score), + callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, ) -> crate::Result<()> { let mut scorer = self.specialized_scorer(reader, 1.0)?; - for_each_scorer(&mut scorer, callback); + for_each_scorer(&mut scorer, callback)?; Ok(()) } diff --git a/src/query/weight.rs b/src/query/weight.rs index 3a2ff3d33c..26bdfb0edb 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -7,13 +7,14 @@ use crate::{DocId, Score, TERMINATED}; /// `DocSet` and push the scored documents to the collector. 
pub(crate) fn for_each_scorer( scorer: &mut TScorer, - callback: &mut dyn FnMut(DocId, Score), -) { + callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, +) -> crate::Result<()> { let mut doc = scorer.doc(); while doc != TERMINATED { - callback(doc, scorer.score()); + callback(doc, scorer.score())?; doc = scorer.advance(); } + Ok(()) } /// Calls `callback` with all of the `(doc, score)` for which score @@ -71,10 +72,10 @@ pub trait Weight: Send + Sync + 'static { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score), + callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_scorer(scorer.as_mut(), callback); + for_each_scorer(scorer.as_mut(), callback)?; Ok(()) } From 44ea7313ca85265945090b18588ce863beb54818 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 12 May 2022 21:04:25 +0800 Subject: [PATCH 05/39] set max bucket size as parameter --- examples/aggregation.rs | 2 +- src/aggregation/agg_req_with_accessor.rs | 20 +++++++-- src/aggregation/bucket/histogram/histogram.rs | 8 ++-- src/aggregation/bucket/range.rs | 14 +++---- src/aggregation/bucket/term_agg.rs | 39 ++++++++++------- src/aggregation/collector.rs | 42 +++++++++++++++---- src/aggregation/intermediate_agg_result.rs | 4 +- src/aggregation/metric/stats.rs | 4 +- src/aggregation/mod.rs | 42 +++++++++---------- src/aggregation/segment_agg_result.rs | 38 ++++++++++++++--- 10 files changed, 140 insertions(+), 73 deletions(-) diff --git a/examples/aggregation.rs b/examples/aggregation.rs index 82cc0fccd3..ae11dc5a5a 100644 --- a/examples/aggregation.rs +++ b/examples/aggregation.rs @@ -117,7 +117,7 @@ fn main() -> tantivy::Result<()> { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap(); diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs index 10597b3de0..491faf2137 100644 --- a/src/aggregation/agg_req_with_accessor.rs +++ b/src/aggregation/agg_req_with_accessor.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation}; use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation}; use super::metric::{AverageAggregation, StatsAggregation}; +use super::segment_agg_result::BucketCount; use super::VecWithNames; use crate::fastfield::{ type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader, @@ -62,7 +63,7 @@ pub struct BucketAggregationWithAccessor { pub(crate) field_type: Type, pub(crate) bucket_agg: BucketAggregationType, pub(crate) sub_aggregation: AggregationsWithAccessor, - pub(crate) bucket_count: Rc, + pub(crate) bucket_count: BucketCount, } impl BucketAggregationWithAccessor { @@ -71,6 +72,7 @@ impl BucketAggregationWithAccessor { sub_aggregation: &Aggregations, reader: &SegmentReader, bucket_count: Rc, + max_bucket_count: u32, ) -> crate::Result { let mut inverted_index = None; let (accessor, field_type) = match &bucket { @@ -96,10 +98,18 @@ impl BucketAggregationWithAccessor { Ok(BucketAggregationWithAccessor { accessor, field_type, - sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?, + sub_aggregation: get_aggs_with_accessor_and_validate( + &sub_aggregation, + reader, + bucket_count.clone(), + 
max_bucket_count, + )?, bucket_agg: bucket.clone(), inverted_index, - bucket_count, + bucket_count: BucketCount { + bucket_count, + max_bucket_count, + }, }) } } @@ -139,8 +149,9 @@ impl MetricAggregationWithAccessor { pub(crate) fn get_aggs_with_accessor_and_validate( aggs: &Aggregations, reader: &SegmentReader, + bucket_count: Rc, + max_bucket_count: u32, ) -> crate::Result { - let bucket_count: Rc = Default::default(); let mut metrics = vec![]; let mut buckets = vec![]; for (key, agg) in aggs.iter() { @@ -152,6 +163,7 @@ pub(crate) fn get_aggs_with_accessor_and_validate( &bucket.sub_aggregation, reader, Rc::clone(&bucket_count), + max_bucket_count, )?, )), Aggregation::Metric(metric) => metrics.push(( diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index 69111c71fc..70acf0f117 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -13,9 +13,7 @@ use crate::aggregation::f64_from_fastfield_u64; use crate::aggregation::intermediate_agg_result::{ IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry, }; -use crate::aggregation::segment_agg_result::{ - validate_bucket_count, SegmentAggregationResultsCollector, -}; +use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; use crate::schema::Type; use crate::{DocId, TantivyError}; @@ -254,8 +252,8 @@ impl SegmentHistogramCollector { agg_with_accessor .bucket_count - .fetch_add(buckets.len() as u32, std::sync::atomic::Ordering::Relaxed); - validate_bucket_count(&agg_with_accessor.bucket_count)?; + .add_count(buckets.len() as u32); + agg_with_accessor.bucket_count.validate_bucket_count()?; Ok(IntermediateBucketResult::Histogram { buckets }) } diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs index 590165c158..7faa500e7c 100644 --- a/src/aggregation/bucket/range.rs +++ b/src/aggregation/bucket/range.rs @@ -1,7 +1,5 @@ use std::fmt::Debug; use std::ops::Range; -use std::rc::Rc; -use std::sync::atomic::AtomicU32; use fnv::FnvHashMap; use serde::{Deserialize, Serialize}; @@ -12,9 +10,7 @@ use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::intermediate_agg_result::{ IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult, }; -use crate::aggregation::segment_agg_result::{ - validate_bucket_count, SegmentAggregationResultsCollector, -}; +use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector}; use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey}; use crate::fastfield::FastFieldReader; use crate::schema::Type; @@ -179,7 +175,7 @@ impl SegmentRangeCollector { pub(crate) fn from_req_and_validate( req: &RangeAggregation, sub_aggregation: &AggregationsWithAccessor, - bucket_count: &Rc, + bucket_count: &BucketCount, field_type: Type, ) -> crate::Result { // The range input on the request is f64. 
@@ -218,8 +214,8 @@ impl SegmentRangeCollector { }) .collect::>()?; - bucket_count.fetch_add(buckets.len() as u32, std::sync::atomic::Ordering::Relaxed); - validate_bucket_count(bucket_count)?; + bucket_count.add_count(buckets.len() as u32); + bucket_count.validate_bucket_count()?; Ok(SegmentRangeCollector { buckets, @@ -438,7 +434,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let reader = index.reader()?; let searcher = reader.searcher(); diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 312522017e..52e120cc96 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -11,9 +11,7 @@ use crate::aggregation::agg_req_with_accessor::{ use crate::aggregation::intermediate_agg_result::{ IntermediateBucketResult, IntermediateTermBucketEntry, IntermediateTermBucketResult, }; -use crate::aggregation::segment_agg_result::{ - validate_bucket_count, SegmentAggregationResultsCollector, -}; +use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector}; use crate::error::DataCorruption; use crate::fastfield::MultiValuedFastFieldReader; use crate::schema::Type; @@ -246,23 +244,23 @@ impl TermBuckets { &mut self, term_ids: &[u64], doc: DocId, - bucket_with_accessor: &BucketAggregationWithAccessor, + sub_aggregation: &AggregationsWithAccessor, + bucket_count: &BucketCount, blueprint: &Option, ) -> crate::Result<()> { for &term_id in term_ids { let entry = self.entries.entry(term_id as u32).or_insert_with(|| { - bucket_with_accessor - .bucket_count - .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + bucket_count.add_count(1); TermBucketEntry::from_blueprint(blueprint) }); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, &bucket_with_accessor.sub_aggregation)?; + sub_aggregations.collect(doc, &sub_aggregation)?; } } - validate_bucket_count(&bucket_with_accessor.bucket_count)?; + bucket_count.validate_bucket_count()?; + Ok(()) } @@ -447,25 +445,29 @@ impl SegmentTermCollector { self.term_buckets.increment_bucket( &vals1, docs[0], - bucket_with_accessor, + &bucket_with_accessor.sub_aggregation, + &bucket_with_accessor.bucket_count, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals2, docs[1], - bucket_with_accessor, + &bucket_with_accessor.sub_aggregation, + &bucket_with_accessor.bucket_count, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals3, docs[2], - bucket_with_accessor, + &bucket_with_accessor.sub_aggregation, + &bucket_with_accessor.bucket_count, &self.blueprint, )?; self.term_buckets.increment_bucket( &vals4, docs[3], - bucket_with_accessor, + &bucket_with_accessor.sub_aggregation, + &bucket_with_accessor.bucket_count, &self.blueprint, )?; } @@ -475,7 +477,8 @@ impl SegmentTermCollector { self.term_buckets.increment_bucket( &vals1, doc, - bucket_with_accessor, + &bucket_with_accessor.sub_aggregation, + &bucket_with_accessor.bucket_count, &self.blueprint, )?; } @@ -1326,9 +1329,15 @@ mod bench { let mut collector = get_collector_with_buckets(total_terms); let vals = get_rand_terms(total_terms, num_terms); let aggregations_with_accessor: AggregationsWithAccessor = Default::default(); + let bucket_count: BucketCount = BucketCount { + bucket_count: Default::default(), + max_bucket_count: 1_000_001u32, + }; b.iter(|| { for &val in &vals { - collector.increment_bucket(&[val], 0, 
&aggregations_with_accessor, &None); + collector + .increment_bucket(&[val], 0, &aggregations_with_accessor, &bucket_count, &None) + .unwrap(); } }) } diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 69913931cd..cf2848f383 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -1,3 +1,5 @@ +use std::rc::Rc; + use super::agg_req::Aggregations; use super::agg_req_with_accessor::AggregationsWithAccessor; use super::agg_result::AggregationResults; @@ -7,17 +9,25 @@ use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_valida use crate::collector::{Collector, SegmentCollector}; use crate::SegmentReader; +pub const MAX_BUCKET_COUNT: u32 = 65000; + /// Collector for aggregations. /// /// The collector collects all aggregations by the underlying aggregation request. pub struct AggregationCollector { agg: Aggregations, + max_bucket_count: u32, } impl AggregationCollector { /// Create collector from aggregation request. - pub fn from_aggs(agg: Aggregations) -> Self { - Self { agg } + /// + /// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset + pub fn from_aggs(agg: Aggregations, max_bucket_count: Option) -> Self { + Self { + agg, + max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT), + } } } @@ -28,15 +38,21 @@ impl AggregationCollector { /// # Purpose /// AggregationCollector returns `IntermediateAggregationResults` and not the final /// `AggregationResults`, so that results from differenct indices can be merged and then converted -/// into the final `AggregationResults` via the `into()` method. +/// into the final `AggregationResults` via the `into_final_result()` method. pub struct DistributedAggregationCollector { agg: Aggregations, + max_bucket_count: u32, } impl DistributedAggregationCollector { /// Create collector from aggregation request. 
- pub fn from_aggs(agg: Aggregations) -> Self { - Self { agg } + /// + /// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset + pub fn from_aggs(agg: Aggregations, max_bucket_count: Option) -> Self { + Self { + agg, + max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT), + } } } @@ -50,7 +66,11 @@ impl Collector for DistributedAggregationCollector { _segment_local_id: crate::SegmentOrdinal, reader: &crate::SegmentReader, ) -> crate::Result { - AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader) + AggregationSegmentCollector::from_agg_req_and_reader( + &self.agg, + reader, + self.max_bucket_count, + ) } fn requires_scoring(&self) -> bool { @@ -75,7 +95,11 @@ impl Collector for AggregationCollector { _segment_local_id: crate::SegmentOrdinal, reader: &crate::SegmentReader, ) -> crate::Result { - AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader) + AggregationSegmentCollector::from_agg_req_and_reader( + &self.agg, + reader, + self.max_bucket_count, + ) } fn requires_scoring(&self) -> bool { @@ -117,8 +141,10 @@ impl AggregationSegmentCollector { pub fn from_agg_req_and_reader( agg: &Aggregations, reader: &SegmentReader, + max_bucket_count: u32, ) -> crate::Result { - let aggs_with_accessor = get_aggs_with_accessor_and_validate(agg, reader)?; + let aggs_with_accessor = + get_aggs_with_accessor_and_validate(agg, reader, Rc::default(), max_bucket_count)?; let result = SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?; Ok(AggregationSegmentCollector { diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index cb2f9f416c..20eef59c07 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -35,7 +35,7 @@ pub struct IntermediateAggregationResults { } impl IntermediateAggregationResults { - /// Convert and intermediate result and its aggregation request to the final result + /// Convert intermediate result and its aggregation request to the final result. pub(crate) fn into_final_bucket_result( self, req: Aggregations, @@ -43,7 +43,7 @@ impl IntermediateAggregationResults { self.into_final_bucket_result_internal(&(req.into())) } - /// Convert and intermediate result and its aggregation request to the final result + /// Convert intermediate result and its aggregation request to the final result. /// /// Internal function, AggregationsInternal is used instead Aggregations, which is optimized /// for internal processing, by splitting metric and buckets into seperate groups. diff --git a/src/aggregation/metric/stats.rs b/src/aggregation/metric/stats.rs index 0498ffbe80..2f704b17d0 100644 --- a/src/aggregation/metric/stats.rs +++ b/src/aggregation/metric/stats.rs @@ -222,7 +222,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let reader = index.reader()?; let searcher = reader.searcher(); @@ -299,7 +299,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap(); diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 7fe2d82847..7f6f8378ce 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -28,7 +28,7 @@ //! //! ```verbatim //! 
let agg_req: Aggregations = serde_json::from_str(json_request_string).unwrap(); -//! let collector = AggregationCollector::from_aggs(agg_req); +//! let collector = AggregationCollector::from_aggs(agg_req, None); //! let searcher = reader.searcher(); //! let agg_res = searcher.search(&term_query, &collector).unwrap_err(); //! let json_response_string: String = &serde_json::to_string(&agg_res)?; @@ -68,7 +68,7 @@ //! .into_iter() //! .collect(); //! -//! let collector = AggregationCollector::from_aggs(agg_req); +//! let collector = AggregationCollector::from_aggs(agg_req, None); //! //! let searcher = reader.searcher(); //! let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap(); @@ -358,7 +358,7 @@ mod tests { index: &Index, query: Option<(&str, &str)>, ) -> crate::Result { - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let reader = index.reader()?; let searcher = reader.searcher(); @@ -547,7 +547,7 @@ mod tests { .unwrap(); let agg_res: AggregationResults = if use_distributed_collector { - let collector = DistributedAggregationCollector::from_aggs(agg_req.clone()); + let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), None); let searcher = reader.searcher(); let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap(); @@ -555,7 +555,7 @@ mod tests { .into_final_bucket_result(agg_req) .unwrap() } else { - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let searcher = reader.searcher(); searcher.search(&AllQuery, &collector).unwrap() @@ -792,7 +792,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap(); @@ -982,7 +982,7 @@ mod tests { assert_eq!(field_names, vec!["text".to_string()].into_iter().collect()); let agg_res: AggregationResults = if use_distributed_collector { - let collector = DistributedAggregationCollector::from_aggs(agg_req.clone()); + let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), None); let searcher = reader.searcher(); let res = searcher.search(&term_query, &collector).unwrap(); @@ -991,7 +991,7 @@ mod tests { serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap(); res.into_final_bucket_result(agg_req.clone()).unwrap() } else { - let collector = AggregationCollector::from_aggs(agg_req.clone()); + let collector = AggregationCollector::from_aggs(agg_req.clone(), None); let searcher = reader.searcher(); searcher.search(&term_query, &collector).unwrap() @@ -1049,7 +1049,7 @@ mod tests { ); // Test empty result set - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let searcher = reader.searcher(); searcher.search(&query_with_no_hits, &collector).unwrap(); @@ -1114,7 +1114,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); @@ -1227,7 +1227,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: 
AggregationResults = @@ -1258,7 +1258,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1289,7 +1289,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1328,7 +1328,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1357,7 +1357,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1386,7 +1386,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req); + let collector = AggregationCollector::from_aggs(agg_req, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1422,7 +1422,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1457,7 +1457,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1496,7 +1496,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1526,7 +1526,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = @@ -1582,7 +1582,7 @@ mod tests { .into_iter() .collect(); - let collector = AggregationCollector::from_aggs(agg_req_1); + let collector = AggregationCollector::from_aggs(agg_req_1, None); let searcher = reader.searcher(); let agg_res: AggregationResults = diff --git a/src/aggregation/segment_agg_result.rs b/src/aggregation/segment_agg_result.rs index 57c545f7d6..fe07400897 100644 --- a/src/aggregation/segment_agg_result.rs +++ b/src/aggregation/segment_agg_result.rs @@ -12,6 +12,7 @@ use super::agg_req_with_accessor::{ AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor, }; use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector}; +use super::collector::MAX_BUCKET_COUNT; use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult}; use super::metric::{ AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation, @@ -277,11 +278,36 @@ impl SegmentBucketResultCollector { } } -pub(crate) fn validate_bucket_count(bucket_count: &Rc) -> crate::Result<()> { - if bucket_count.load(std::sync::atomic::Ordering::Relaxed) > 65000 { - return Err(TantivyError::InvalidArgument( - "Aborting 
aggregation because too many buckets were created".to_string(), - )); +#[derive(Clone)] +pub(crate) struct BucketCount { + /// The counter which is shared between the aggregations for one request. + pub(crate) bucket_count: Rc, + pub(crate) max_bucket_count: u32, +} + +impl Default for BucketCount { + fn default() -> Self { + Self { + bucket_count: Default::default(), + max_bucket_count: MAX_BUCKET_COUNT, + } + } +} + +impl BucketCount { + pub(crate) fn validate_bucket_count(&self) -> crate::Result<()> { + if self.get_count() > self.max_bucket_count { + return Err(TantivyError::InvalidArgument( + "Aborting aggregation because too many buckets were created".to_string(), + )); + } + Ok(()) + } + pub(crate) fn add_count(&self, count: u32) { + self.bucket_count + .fetch_add(count as u32, std::sync::atomic::Ordering::Relaxed); + } + pub(crate) fn get_count(&self) -> u32 { + self.bucket_count.load(std::sync::atomic::Ordering::Relaxed) } - Ok(()) } From c5c2e59b2bbb810db2e55758f220384f9e2c2724 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 17 May 2022 11:35:39 +0800 Subject: [PATCH 06/39] introduce optional collect_block in segmentcollector add collect_block in segment_collector to handle groups of documents as performance optimization add collect_block for MultiCollector --- src/aggregation/bucket/term_agg.rs | 2 +- src/collector/count_collector.rs | 5 +++ src/collector/custom_score_top_collector.rs | 14 +++++- src/collector/mod.rs | 47 ++++++++++++++++++++- src/collector/multi_collector.rs | 18 ++++++++ src/collector/top_score_collector.rs | 8 ++++ 6 files changed, 89 insertions(+), 5 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 52e120cc96..8a9970e0fd 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -256,7 +256,7 @@ impl TermBuckets { }); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, &sub_aggregation)?; + sub_aggregations.collect(doc, sub_aggregation)?; } } bucket_count.validate_bucket_count()?; diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 02f30f85c1..3ff1368f83 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -70,6 +70,11 @@ impl SegmentCollector for SegmentCountCollector { Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.count += docs.len(); + Ok(()) + } + fn harvest(self) -> usize { self.count } diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index d597727ed1..0f981c20a7 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -8,7 +8,8 @@ pub(crate) struct CustomScoreTopCollector { } impl CustomScoreTopCollector -where TScore: Clone + PartialOrd +where + TScore: Clone + PartialOrd, { pub(crate) fn new( custom_scorer: TCustomScorer, @@ -96,6 +97,14 @@ where Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + for (doc, _score) in docs { + let score = self.segment_scorer.score(*doc); + self.segment_collector.collect(*doc, score); + } + Ok(()) + } + fn harvest(self) -> Vec<(TScore, DocAddress)> { self.segment_collector.harvest() } @@ -114,7 +123,8 @@ where } impl CustomSegmentScorer for F -where F: 'static + FnMut(DocId) -> TScore +where + F: 'static + FnMut(DocId) -> TScore, { fn score(&mut self, doc: DocId) -> TScore { (self)(doc) diff --git 
a/src/collector/mod.rs b/src/collector/mod.rs index 6d600bb488..7e6d43f7b7 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -172,19 +172,33 @@ pub trait Collector: Sync + Send { ) -> crate::Result<::Fruit> { let mut segment_collector = self.for_segment(segment_ord as u32, reader)?; + let mut cache_pos = 0; + let mut cache = [(0, 0.0); 64]; + if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) { - segment_collector.collect(doc, score)?; + cache[cache_pos] = (doc, score); + cache_pos += 1; + if cache_pos == 64 { + segment_collector.collect_block(&cache)?; + cache_pos = 0; + } } Ok(()) })?; } else { weight.for_each(reader, &mut |doc, score| { - segment_collector.collect(doc, score)?; + cache[cache_pos] = (doc, score); + cache_pos += 1; + if cache_pos == 64 { + segment_collector.collect_block(&cache)?; + cache_pos = 0; + } Ok(()) })?; } + segment_collector.collect_block(&cache[..cache_pos])?; Ok(segment_collector.harvest()) } } @@ -258,6 +272,14 @@ pub trait SegmentCollector: 'static { /// The query pushes the scored document to the collector via this method. fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()>; + /// The query pushes the scored document to the collector via this method. + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + for (doc, score) in docs { + self.collect(*doc, *score)?; + } + Ok(()) + } + /// Extract the fruit of the collection from the `SegmentCollector`. fn harvest(self) -> Self::Fruit; } @@ -317,6 +339,12 @@ where Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.0.collect_block(docs)?; + self.1.collect_block(docs)?; + Ok(()) + } + fn harvest(self) -> ::Fruit { (self.0.harvest(), self.1.harvest()) } @@ -383,6 +411,13 @@ where Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.0.collect_block(docs)?; + self.1.collect_block(docs)?; + self.2.collect_block(docs)?; + Ok(()) + } + fn harvest(self) -> ::Fruit { (self.0.harvest(), self.1.harvest(), self.2.harvest()) } @@ -459,6 +494,14 @@ where Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.0.collect_block(docs)?; + self.1.collect_block(docs)?; + self.2.collect_block(docs)?; + self.3.collect_block(docs)?; + Ok(()) + } + fn harvest(self) -> ::Fruit { ( self.0.harvest(), diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 7b119ad868..a7a8ed1692 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -57,6 +57,11 @@ impl SegmentCollector for Box { Ok(()) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.as_mut().collect_block(docs)?; + Ok(()) + } + fn harvest(self) -> Box { BoxableSegmentCollector::harvest_from_box(self) } @@ -64,6 +69,7 @@ impl SegmentCollector for Box { pub trait BoxableSegmentCollector { fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()>; + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()>; fn harvest_from_box(self: Box) -> Box; } @@ -76,6 +82,11 @@ impl BoxableSegmentCollector self.0.collect(doc, score) } + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + self.0.collect_block(docs)?; + Ok(()) + } + fn harvest_from_box(self: Box) -> Box { Box::new(self.0.harvest()) } @@ -236,6 +247,13 @@ impl SegmentCollector for MultiCollectorChild { Ok(()) } + fn collect_block(&mut self, 
docs: &[(DocId, Score)]) -> crate::Result<()> { + for child in &mut self.children { + child.collect_block(docs)?; + } + Ok(()) + } + fn harvest(self) -> MultiFruit { MultiFruit { sub_fruits: self diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index e0e3aeb9dc..6074d088a3 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -704,6 +704,14 @@ impl SegmentCollector for TopScoreSegmentCollector { Ok(()) } + #[inline] + fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { + for (doc, score) in docs { + self.0.collect(*doc, *score); + } + Ok(()) + } + fn harvest(self) -> Vec<(Score, DocAddress)> { self.0.harvest() } From 17dcc99e43f203da354def2af22e33184030e739 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 19 May 2022 16:25:21 +0800 Subject: [PATCH 07/39] Revert "introduce optional collect_block in segmentcollector" This reverts commit c5c2e59b2bbb810db2e55758f220384f9e2c2724. --- src/aggregation/bucket/term_agg.rs | 2 +- src/collector/count_collector.rs | 5 --- src/collector/custom_score_top_collector.rs | 14 +----- src/collector/mod.rs | 47 +-------------------- src/collector/multi_collector.rs | 18 -------- src/collector/top_score_collector.rs | 8 ---- 6 files changed, 5 insertions(+), 89 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 8a9970e0fd..52e120cc96 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -256,7 +256,7 @@ impl TermBuckets { }); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, sub_aggregation)?; + sub_aggregations.collect(doc, &sub_aggregation)?; } } bucket_count.validate_bucket_count()?; diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 3ff1368f83..02f30f85c1 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -70,11 +70,6 @@ impl SegmentCollector for SegmentCountCollector { Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.count += docs.len(); - Ok(()) - } - fn harvest(self) -> usize { self.count } diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index 0f981c20a7..d597727ed1 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -8,8 +8,7 @@ pub(crate) struct CustomScoreTopCollector { } impl CustomScoreTopCollector -where - TScore: Clone + PartialOrd, +where TScore: Clone + PartialOrd { pub(crate) fn new( custom_scorer: TCustomScorer, @@ -97,14 +96,6 @@ where Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - for (doc, _score) in docs { - let score = self.segment_scorer.score(*doc); - self.segment_collector.collect(*doc, score); - } - Ok(()) - } - fn harvest(self) -> Vec<(TScore, DocAddress)> { self.segment_collector.harvest() } @@ -123,8 +114,7 @@ where } impl CustomSegmentScorer for F -where - F: 'static + FnMut(DocId) -> TScore, +where F: 'static + FnMut(DocId) -> TScore { fn score(&mut self, doc: DocId) -> TScore { (self)(doc) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 7e6d43f7b7..6d600bb488 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -172,33 +172,19 @@ pub trait Collector: Sync + Send { ) -> crate::Result<::Fruit> { let mut segment_collector = self.for_segment(segment_ord as u32, reader)?; 
- let mut cache_pos = 0; - let mut cache = [(0, 0.0); 64]; - if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) { - cache[cache_pos] = (doc, score); - cache_pos += 1; - if cache_pos == 64 { - segment_collector.collect_block(&cache)?; - cache_pos = 0; - } + segment_collector.collect(doc, score)?; } Ok(()) })?; } else { weight.for_each(reader, &mut |doc, score| { - cache[cache_pos] = (doc, score); - cache_pos += 1; - if cache_pos == 64 { - segment_collector.collect_block(&cache)?; - cache_pos = 0; - } + segment_collector.collect(doc, score)?; Ok(()) })?; } - segment_collector.collect_block(&cache[..cache_pos])?; Ok(segment_collector.harvest()) } } @@ -272,14 +258,6 @@ pub trait SegmentCollector: 'static { /// The query pushes the scored document to the collector via this method. fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()>; - /// The query pushes the scored document to the collector via this method. - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - for (doc, score) in docs { - self.collect(*doc, *score)?; - } - Ok(()) - } - /// Extract the fruit of the collection from the `SegmentCollector`. fn harvest(self) -> Self::Fruit; } @@ -339,12 +317,6 @@ where Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.0.collect_block(docs)?; - self.1.collect_block(docs)?; - Ok(()) - } - fn harvest(self) -> ::Fruit { (self.0.harvest(), self.1.harvest()) } @@ -411,13 +383,6 @@ where Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.0.collect_block(docs)?; - self.1.collect_block(docs)?; - self.2.collect_block(docs)?; - Ok(()) - } - fn harvest(self) -> ::Fruit { (self.0.harvest(), self.1.harvest(), self.2.harvest()) } @@ -494,14 +459,6 @@ where Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.0.collect_block(docs)?; - self.1.collect_block(docs)?; - self.2.collect_block(docs)?; - self.3.collect_block(docs)?; - Ok(()) - } - fn harvest(self) -> ::Fruit { ( self.0.harvest(), diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index a7a8ed1692..7b119ad868 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -57,11 +57,6 @@ impl SegmentCollector for Box { Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.as_mut().collect_block(docs)?; - Ok(()) - } - fn harvest(self) -> Box { BoxableSegmentCollector::harvest_from_box(self) } @@ -69,7 +64,6 @@ impl SegmentCollector for Box { pub trait BoxableSegmentCollector { fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()>; - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()>; fn harvest_from_box(self: Box) -> Box; } @@ -82,11 +76,6 @@ impl BoxableSegmentCollector self.0.collect(doc, score) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - self.0.collect_block(docs)?; - Ok(()) - } - fn harvest_from_box(self: Box) -> Box { Box::new(self.0.harvest()) } @@ -247,13 +236,6 @@ impl SegmentCollector for MultiCollectorChild { Ok(()) } - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - for child in &mut self.children { - child.collect_block(docs)?; - } - Ok(()) - } - fn harvest(self) -> MultiFruit { MultiFruit { sub_fruits: self diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 
6074d088a3..e0e3aeb9dc 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -704,14 +704,6 @@ impl SegmentCollector for TopScoreSegmentCollector { Ok(()) } - #[inline] - fn collect_block(&mut self, docs: &[(DocId, Score)]) -> crate::Result<()> { - for (doc, score) in docs { - self.0.collect(*doc, *score); - } - Ok(()) - } - fn harvest(self) -> Vec<(Score, DocAddress)> { self.0.harvest() } From b114e553cd2c373e19307ab3e922abe45fa49f42 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 19 May 2022 16:43:55 +0800 Subject: [PATCH 08/39] Revert "return result from segment collector" This reverts commit a99e5459e3e58064ab8917b81e21e924e5ca0548. --- examples/custom_collector.rs | 3 +- src/aggregation/collector.rs | 3 +- src/collector/count_collector.rs | 11 +++--- src/collector/custom_score_top_collector.rs | 3 +- src/collector/docset_collector.rs | 3 +- src/collector/facet_collector.rs | 3 +- src/collector/filter_collector_wrapper.rs | 5 ++- src/collector/histogram_collector.rs | 3 +- src/collector/mod.rs | 40 +++++++++------------ src/collector/multi_collector.rs | 16 ++++----- src/collector/tests.rs | 9 ++--- src/collector/top_score_collector.rs | 3 +- src/collector/tweak_score_top_collector.rs | 3 +- src/query/boolean_query/boolean_weight.rs | 6 ++-- src/query/term_query/term_weight.rs | 4 +-- src/query/weight.rs | 11 +++--- 16 files changed, 52 insertions(+), 74 deletions(-) diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 14a9d20358..7bdc9d06b4 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -102,12 +102,11 @@ struct StatsSegmentCollector { impl SegmentCollector for StatsSegmentCollector { type Fruit = Option; - fn collect(&mut self, doc: u32, _score: Score) -> tantivy::Result<()> { + fn collect(&mut self, doc: u32, _score: Score) { let value = self.fast_field_reader.get(doc) as f64; self.stats.count += 1; self.stats.sum += value; self.stats.squared_sum += value * value; - Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index cf2848f383..09c8ecbe1d 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -158,9 +158,8 @@ impl SegmentCollector for AggregationSegmentCollector { type Fruit = crate::Result; #[inline] - fn collect(&mut self, doc: crate::DocId, _score: crate::Score) -> crate::Result<()> { + fn collect(&mut self, doc: crate::DocId, _score: crate::Score) { self.result.collect(doc, &self.aggs_with_accessor)?; - Ok(()) } fn harvest(mut self) -> Self::Fruit { diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 02f30f85c1..075a4f36b4 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -65,9 +65,8 @@ pub struct SegmentCountCollector { impl SegmentCollector for SegmentCountCollector { type Fruit = usize; - fn collect(&mut self, _: DocId, _: Score) -> crate::Result<()> { + fn collect(&mut self, _: DocId, _: Score) { self.count += 1; - Ok(()) } fn harvest(self) -> usize { @@ -93,18 +92,18 @@ mod tests { } { let mut count_collector = SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0).unwrap(); + count_collector.collect(0u32, 1.0); assert_eq!(count_collector.harvest(), 1); } { let mut count_collector = SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0).unwrap(); + count_collector.collect(0u32, 1.0); assert_eq!(count_collector.harvest(), 1); } { let mut count_collector = 
SegmentCountCollector::default(); - count_collector.collect(0u32, 1.0).unwrap(); - count_collector.collect(1u32, 1.0).unwrap(); + count_collector.collect(0u32, 1.0); + count_collector.collect(1u32, 1.0); assert_eq!(count_collector.harvest(), 2); } } diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index d597727ed1..d645004ade 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -90,10 +90,9 @@ where { type Fruit = Vec<(TScore, DocAddress)>; - fn collect(&mut self, doc: DocId, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, _score: Score) { let score = self.segment_scorer.score(doc); self.segment_collector.collect(doc, score); - Ok(()) } fn harvest(self) -> Vec<(TScore, DocAddress)> { diff --git a/src/collector/docset_collector.rs b/src/collector/docset_collector.rs index 9f6a5fd3bd..a27a394189 100644 --- a/src/collector/docset_collector.rs +++ b/src/collector/docset_collector.rs @@ -50,9 +50,8 @@ pub struct DocSetChildCollector { impl SegmentCollector for DocSetChildCollector { type Fruit = (u32, HashSet); - fn collect(&mut self, doc: crate::DocId, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: crate::DocId, _score: Score) { self.docs.insert(doc); - Ok(()) } fn harvest(self) -> (u32, HashSet) { diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 8ad3311e28..e2ef47f989 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -333,7 +333,7 @@ impl Collector for FacetCollector { impl SegmentCollector for FacetSegmentCollector { type Fruit = FacetCounts; - fn collect(&mut self, doc: DocId, _: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, _: Score) { self.reader.facet_ords(doc, &mut self.facet_ords_buf); let mut previous_collapsed_ord: usize = usize::MAX; for &facet_ord in &self.facet_ords_buf { @@ -345,7 +345,6 @@ impl SegmentCollector for FacetSegmentCollector { }; previous_collapsed_ord = collapsed_ord; } - Ok(()) } /// Returns the results of the collection. 
diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index 15e7f80212..b1dbaaa203 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -173,12 +173,11 @@ where { type Fruit = TSegmentCollector::Fruit; - fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: u32, score: Score) { let value = self.fast_field_reader.get(doc); if (self.predicate)(value) { - self.segment_collector.collect(doc, score)?; + self.segment_collector.collect(doc, score) } - Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index fbf398627a..22956a86a2 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -91,10 +91,9 @@ pub struct SegmentHistogramCollector { impl SegmentCollector for SegmentHistogramCollector { type Fruit = Vec; - fn collect(&mut self, doc: DocId, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, _score: Score) { let value = self.ff_reader.get(doc); self.histogram_computer.add_value(value); - Ok(()) } fn harvest(self) -> Self::Fruit { diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 6d600bb488..1597d7fe45 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -175,14 +175,12 @@ pub trait Collector: Sync + Send { if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { if alive_bitset.is_alive(doc) { - segment_collector.collect(doc, score)?; + segment_collector.collect(doc, score); } - Ok(()) })?; } else { weight.for_each(reader, &mut |doc, score| { - segment_collector.collect(doc, score)?; - Ok(()) + segment_collector.collect(doc, score); })?; } Ok(segment_collector.harvest()) @@ -192,11 +190,10 @@ pub trait Collector: Sync + Send { impl SegmentCollector for Option { type Fruit = Option; - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, score: Score) { if let Some(segment_collector) = self { - segment_collector.collect(doc, score)?; + segment_collector.collect(doc, score); } - Ok(()) } fn harvest(self) -> Self::Fruit { @@ -256,7 +253,7 @@ pub trait SegmentCollector: 'static { type Fruit: Fruit; /// The query pushes the scored document to the collector via this method. - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()>; + fn collect(&mut self, doc: DocId, score: Score); /// Extract the fruit of the collection from the `SegmentCollector`. 
fn harvest(self) -> Self::Fruit; @@ -311,10 +308,9 @@ where { type Fruit = (Left::Fruit, Right::Fruit); - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { - self.0.collect(doc, score)?; - self.1.collect(doc, score)?; - Ok(()) + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); } fn harvest(self) -> ::Fruit { @@ -376,11 +372,10 @@ where { type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { - self.0.collect(doc, score)?; - self.1.collect(doc, score)?; - self.2.collect(doc, score)?; - Ok(()) + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); + self.2.collect(doc, score); } fn harvest(self) -> ::Fruit { @@ -451,12 +446,11 @@ where { type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { - self.0.collect(doc, score)?; - self.1.collect(doc, score)?; - self.2.collect(doc, score)?; - self.3.collect(doc, score)?; - Ok(()) + fn collect(&mut self, doc: DocId, score: Score) { + self.0.collect(doc, score); + self.1.collect(doc, score); + self.2.collect(doc, score); + self.3.collect(doc, score); } fn harvest(self) -> ::Fruit { diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 7b119ad868..039902ff4f 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -52,9 +52,8 @@ impl Collector for CollectorWrapper { impl SegmentCollector for Box { type Fruit = Box; - fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { - self.as_mut().collect(doc, score)?; - Ok(()) + fn collect(&mut self, doc: u32, score: Score) { + self.as_mut().collect(doc, score); } fn harvest(self) -> Box { @@ -63,7 +62,7 @@ impl SegmentCollector for Box { } pub trait BoxableSegmentCollector { - fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()>; + fn collect(&mut self, doc: u32, score: Score); fn harvest_from_box(self: Box) -> Box; } @@ -72,8 +71,8 @@ pub struct SegmentCollectorWrapper(TSegment impl BoxableSegmentCollector for SegmentCollectorWrapper { - fn collect(&mut self, doc: u32, score: Score) -> crate::Result<()> { - self.0.collect(doc, score) + fn collect(&mut self, doc: u32, score: Score) { + self.0.collect(doc, score); } fn harvest_from_box(self: Box) -> Box { @@ -229,11 +228,10 @@ pub struct MultiCollectorChild { impl SegmentCollector for MultiCollectorChild { type Fruit = MultiFruit; - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, score: Score) { for child in &mut self.children { - child.collect(doc, score)?; + child.collect(doc, score); } - Ok(()) } fn harvest(self) -> MultiFruit { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 5e0a0cfb2d..3bda822a10 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -138,10 +138,9 @@ impl Collector for TestCollector { impl SegmentCollector for TestSegmentCollector { type Fruit = TestFruit; - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, score: Score) { self.fruit.docs.push(DocAddress::new(self.segment_id, doc)); self.fruit.scores.push(score); - Ok(()) } fn harvest(self) -> ::Fruit { @@ -199,10 +198,9 @@ impl Collector for FastFieldTestCollector { impl SegmentCollector for FastFieldSegmentCollector { type Fruit = Vec; - fn collect(&mut self, doc: 
DocId, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, _score: Score) { let val = self.reader.get(doc); self.vals.push(val); - Ok(()) } fn harvest(self) -> Vec { @@ -257,10 +255,9 @@ impl Collector for BytesFastFieldTestCollector { impl SegmentCollector for BytesFastFieldSegmentCollector { type Fruit = Vec; - fn collect(&mut self, doc: u32, _score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: u32, _score: Score) { let data = self.reader.get_bytes(doc); self.vals.extend(data); - Ok(()) } fn harvest(self) -> ::Fruit { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index e0e3aeb9dc..516dedcb58 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -699,9 +699,8 @@ pub struct TopScoreSegmentCollector(TopSegmentCollector); impl SegmentCollector for TopScoreSegmentCollector { type Fruit = Vec<(Score, DocAddress)>; - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, score: Score) { self.0.collect(doc, score); - Ok(()) } fn harvest(self) -> Vec<(Score, DocAddress)> { diff --git a/src/collector/tweak_score_top_collector.rs b/src/collector/tweak_score_top_collector.rs index cb8d358abe..1a81e73616 100644 --- a/src/collector/tweak_score_top_collector.rs +++ b/src/collector/tweak_score_top_collector.rs @@ -93,10 +93,9 @@ where { type Fruit = Vec<(TScore, DocAddress)>; - fn collect(&mut self, doc: DocId, score: Score) -> crate::Result<()> { + fn collect(&mut self, doc: DocId, score: Score) { let score = self.segment_scorer.score(doc, score); self.segment_collector.collect(doc, score); - Ok(()) } fn harvest(self) -> Vec<(TScore, DocAddress)> { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 9024a6abb8..f2a5fd376a 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -186,17 +186,17 @@ impl Weight for BooleanWeight { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, + callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { let scorer = self.complex_scorer::(reader, 1.0)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { let mut union_scorer = Union::::from(term_scorers); - for_each_scorer(&mut union_scorer, callback)?; + for_each_scorer(&mut union_scorer, callback); } SpecializedScorer::Other(mut scorer) => { - for_each_scorer(scorer.as_mut(), callback)?; + for_each_scorer(scorer.as_mut(), callback); } } Ok(()) diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index e1529027b2..4e742bc444 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -49,10 +49,10 @@ impl Weight for TermWeight { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, + callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { let mut scorer = self.specialized_scorer(reader, 1.0)?; - for_each_scorer(&mut scorer, callback)?; + for_each_scorer(&mut scorer, callback); Ok(()) } diff --git a/src/query/weight.rs b/src/query/weight.rs index 26bdfb0edb..3a2ff3d33c 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -7,14 +7,13 @@ use crate::{DocId, Score, TERMINATED}; /// `DocSet` and push the scored documents to the collector. 
pub(crate) fn for_each_scorer( scorer: &mut TScorer, - callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, -) -> crate::Result<()> { + callback: &mut dyn FnMut(DocId, Score), +) { let mut doc = scorer.doc(); while doc != TERMINATED { - callback(doc, scorer.score())?; + callback(doc, scorer.score()); doc = scorer.advance(); } - Ok(()) } /// Calls `callback` with all of the `(doc, score)` for which score @@ -72,10 +71,10 @@ pub trait Weight: Send + Sync + 'static { fn for_each( &self, reader: &SegmentReader, - callback: &mut dyn FnMut(DocId, Score) -> crate::Result<()>, + callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { let mut scorer = self.scorer(reader, 1.0)?; - for_each_scorer(scorer.as_mut(), callback)?; + for_each_scorer(scorer.as_mut(), callback); Ok(()) } From 71f75071d2ab4d9380a2254cf2197bdd93688fc2 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 19 May 2022 16:58:56 +0800 Subject: [PATCH 09/39] cache and return error in aggregations --- src/aggregation/bucket/term_agg.rs | 2 +- src/aggregation/collector.rs | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs index 52e120cc96..8a9970e0fd 100644 --- a/src/aggregation/bucket/term_agg.rs +++ b/src/aggregation/bucket/term_agg.rs @@ -256,7 +256,7 @@ impl TermBuckets { }); entry.doc_count += 1; if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() { - sub_aggregations.collect(doc, &sub_aggregation)?; + sub_aggregations.collect(doc, sub_aggregation)?; } } bucket_count.validate_bucket_count()?; diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index 09c8ecbe1d..c9510d9263 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -7,7 +7,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults; use super::segment_agg_result::SegmentAggregationResultsCollector; use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate; use crate::collector::{Collector, SegmentCollector}; -use crate::SegmentReader; +use crate::{SegmentReader, TantivyError}; pub const MAX_BUCKET_COUNT: u32 = 65000; @@ -133,6 +133,7 @@ fn merge_fruits( pub struct AggregationSegmentCollector { aggs_with_accessor: AggregationsWithAccessor, result: SegmentAggregationResultsCollector, + error: Option, } impl AggregationSegmentCollector { @@ -150,6 +151,7 @@ impl AggregationSegmentCollector { Ok(AggregationSegmentCollector { aggs_with_accessor, result, + error: None, }) } } @@ -159,10 +161,18 @@ impl SegmentCollector for AggregationSegmentCollector { #[inline] fn collect(&mut self, doc: crate::DocId, _score: crate::Score) { - self.result.collect(doc, &self.aggs_with_accessor)?; + if self.error.is_some() { + return; + } + if let Err(err) = self.result.collect(doc, &self.aggs_with_accessor) { + self.error = Some(err); + } } fn harvest(mut self) -> Self::Fruit { + if let Some(err) = self.error { + return Err(err); + } self.result .flush_staged_docs(&self.aggs_with_accessor, true)?; self.result From 1440f3243b8d2cb7df3f3c4d61f0831d9fc93b9c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 21 Jun 2022 14:47:01 +0800 Subject: [PATCH 10/39] fix clippy --- src/core/index_meta.rs | 4 ++-- src/fastfield/mod.rs | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 8cd9429302..fb79ac71e7 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -469,7 +469,7 @@ mod tests { fn 
test_serialize_metas_invalid_comp() { let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; - let err = serde_json::from_str::(&json).unwrap_err(); + let err = serde_json::from_str::(json).unwrap_err(); assert_eq!( err.to_string(), "unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \ @@ -479,7 +479,7 @@ mod tests { let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; - let err = serde_json::from_str::(&json).unwrap_err(); + let err = serde_json::from_str::(json).unwrap_err(); assert_eq!( err.to_string(), "unknown zstd option \"bla\" at line 1 column 103".to_string() diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index ce16d756ad..dd0e51577d 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -162,10 +162,7 @@ impl FastValue for f64 { impl FastValue for bool { fn from_u64(val: u64) -> Self { - match val { - 0 => false, - _ => true, - } + !matches!(val, 0) } fn to_u64(&self) -> u64 { @@ -853,7 +850,7 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let file = directory.open_read(&path).unwrap(); + let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 36); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); @@ -889,7 +886,7 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let file = directory.open_read(&path).unwrap(); + let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 48); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); @@ -923,7 +920,7 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let file = directory.open_read(&path).unwrap(); + let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 35); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(field).unwrap(); From f21b73d1f6d601cfb3bc999e2757d900ec19984f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 21 Jun 2022 15:52:43 +0900 Subject: [PATCH 11/39] Apply suggestions from code review --- src/fastfield/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index dd0e51577d..c691897476 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -162,7 +162,7 @@ impl FastValue for f64 { impl FastValue for bool { fn from_u64(val: u64) -> Self { - !matches!(val, 0) + val != 0u64 } fn to_u64(&self) -> u64 { From 2e2822f89d885391dd9fe7897e9409bf2e4f1fc3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Jun 2022 09:48:28 +0900 Subject: [PATCH 12/39] Apply suggestions from code review --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4282b830a7..00d88d2938 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,6 @@ Unreleased - Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz) - Add support for fastfield on text fields (@PSeitz) - Add terms aggregation (@PSeitz) -- 
API Change: `SegmentCollector.collect` changed to return a `Result`.
 
 Tantivy 0.17
 ================================

From 93f356a7a7e6da898c3edcdd791a4bc2332a274e Mon Sep 17 00:00:00 2001
From: PSeitz
Date: Thu, 23 Jun 2022 10:53:20 +0800
Subject: [PATCH 13/39] Extend FAQ (#1388)

* Extend FAQ

Co-authored-by: Maxim Kraynyuchenko <100854040+maximkpa@users.noreply.github.com>
---
 README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ab02dd7f2c..97045531a3 100644
--- a/README.md
+++ b/README.md
@@ -152,4 +152,13 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
 - and [more](https://github.com/search?q=tantivy)!
 
 ### On average, how much faster is Tantivy compared to Lucene?
-- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
\ No newline at end of file
+- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
+
+### Does tantivy support incremental indexing?
+- Yes.
+
+### How can I edit documents?
+- Data in tantivy is immutable. To edit a document, the document needs to be deleted and reindexed.
+
+### When will my documents be searchable during indexing?
+- Documents will be searchable after a `commit` is called on an `IndexWriter`. Existing `IndexReader`s will also need to be reloaded in order to reflect the changes. Finally, changes are only visible to newly acquired `Searcher`.

From 4c7dedef296dbdf902e0af6b6698465142bdad5c Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Thu, 16 Jun 2022 13:06:52 +0800
Subject: [PATCH 14/39] use separate thread to compress block store

Use a separate thread to compress the block store for increased indexing
performance. This allows using slower compressors with a higher compression
ratio, with little or no performance impact (given enough cores).

A separate thread is spawned to compress the docstore, which handles single
blocks and stacking from other docstores. The spawned compressor thread does
not write; instead it sends the compressed data back. This is done in order
to avoid writing to the same file from multiple threads.
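To make the threading scheme concrete, here is a minimal, self-contained sketch of the pattern this commit message describes: the indexing thread keeps sole ownership of the output and feeds uncompressed blocks to a dedicated compressor thread over a bounded channel, while a second channel carries the compressed bytes back so only one thread ever writes. All names below (`compress`, the block payloads, the channel capacities) are illustrative placeholders, not tantivy's actual `StoreWriter`/`BlockCompressor` API.

```rust
use std::io::{self, Write};
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::thread;

// Stand-in for the real compressor; the actual code delegates to the
// configured `Compressor` (lz4, zstd, ...).
fn compress(block: &[u8]) -> Vec<u8> {
    block.to_vec()
}

fn main() -> io::Result<()> {
    // Bounded channels provide back-pressure so uncompressed blocks cannot
    // pile up without limit.
    let (block_tx, block_rx): (SyncSender<Vec<u8>>, Receiver<Vec<u8>>) = sync_channel(3);
    let (data_tx, data_rx): (SyncSender<Vec<u8>>, Receiver<Vec<u8>>) = sync_channel(3);

    // Dedicated compressor thread: it compresses blocks but never writes to
    // the file itself; it only sends the compressed bytes back.
    let compressor = thread::spawn(move || {
        while let Ok(block) = block_rx.recv() {
            if data_tx.send(compress(&block)).is_err() {
                break; // receiver is gone, stop compressing
            }
        }
    });

    // The indexing thread keeps exclusive ownership of the writer, so the
    // file is only ever written from one thread.
    let mut writer: Vec<u8> = Vec::new();
    for block in [b"doc block 1".to_vec(), b"doc block 2".to_vec()] {
        block_tx.send(block).expect("compressor thread is alive");
        // Drain whatever is already compressed while we keep producing.
        while let Ok(compressed) = data_rx.try_recv() {
            writer.write_all(&compressed)?;
        }
    }
    drop(block_tx); // signal "no more blocks"

    // Drain the remaining compressed blocks, then join the thread.
    while let Ok(compressed) = data_rx.recv() {
        writer.write_all(&compressed)?;
    }
    compressor.join().expect("compressor thread panicked");
    Ok(())
}
```

A later patch in this series moves the writer into the compressor thread itself, which removes the need for the second channel.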
--- src/indexer/merger.rs | 2 +- src/indexer/segment_serializer.rs | 2 +- src/indexer/segment_writer.rs | 2 +- src/store/index/mod.rs | 10 +- src/store/index/skip_index_builder.rs | 2 +- src/store/mod.rs | 2 +- src/store/writer.rs | 293 ++++++++++++++++++++------ 7 files changed, 236 insertions(+), 77 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 007934ba53..3c4f2a2abc 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1081,7 +1081,7 @@ impl IndexMerger { store_writer.store_bytes(&doc_bytes)?; } } else { - store_writer.stack(&store_reader)?; + store_writer.stack(store_reader)?; } } } diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 554503e668..ffb6a3dc33 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -42,7 +42,7 @@ impl SegmentSerializer { let blocksize = segment.index().settings().docstore_blocksize; Ok(SegmentSerializer { segment, - store_writer: StoreWriter::new(store_write, compressor, blocksize), + store_writer: StoreWriter::new(store_write, compressor, blocksize)?, fast_field_serializer, fieldnorms_serializer: Some(fieldnorms_serializer), postings_serializer, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 94469c5bca..308cca255e 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -382,7 +382,7 @@ fn remap_and_write( let block_size = serializer.segment().index().settings().docstore_blocksize; let old_store_writer = std::mem::replace( &mut serializer.store_writer, - StoreWriter::new(store_write, compressor, block_size), + StoreWriter::new(store_write, compressor, block_size)?, ); old_store_writer.close()?; let store_read = StoreReader::open( diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 2401e23d6d..af572e758b 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -54,7 +54,7 @@ mod tests { fn test_skip_index_empty() -> io::Result<()> { let mut output: Vec = Vec::new(); let skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new(); - skip_index_builder.write(&mut output)?; + skip_index_builder.serialize_into(&mut output)?; let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output)); let mut skip_cursor = skip_index.checkpoints(); assert!(skip_cursor.next().is_none()); @@ -70,7 +70,7 @@ mod tests { byte_range: 0..3, }; skip_index_builder.insert(checkpoint.clone()); - skip_index_builder.write(&mut output)?; + skip_index_builder.serialize_into(&mut output)?; let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output)); let mut skip_cursor = skip_index.checkpoints(); assert_eq!(skip_cursor.next(), Some(checkpoint)); @@ -108,7 +108,7 @@ mod tests { for checkpoint in &checkpoints { skip_index_builder.insert(checkpoint.clone()); } - skip_index_builder.write(&mut output)?; + skip_index_builder.serialize_into(&mut output)?; let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output)); assert_eq!( @@ -167,7 +167,7 @@ mod tests { for checkpoint in &checkpoints { skip_index_builder.insert(checkpoint.clone()); } - skip_index_builder.write(&mut output)?; + skip_index_builder.serialize_into(&mut output)?; assert_eq!(output.len(), 4035); let resulting_checkpoints: Vec = SkipIndex::open(OwnedBytes::new(output)) .checkpoints() @@ -238,7 +238,7 @@ mod tests { skip_index_builder.insert(checkpoint); } let mut buffer = Vec::new(); - skip_index_builder.write(&mut buffer).unwrap(); + skip_index_builder.serialize_into(&mut buffer).unwrap(); let skip_index = 
SkipIndex::open(OwnedBytes::new(buffer)); let iter_checkpoints: Vec = skip_index.checkpoints().collect(); assert_eq!(&checkpoints[..], &iter_checkpoints[..]); diff --git a/src/store/index/skip_index_builder.rs b/src/store/index/skip_index_builder.rs index cbb899a219..2f34376cd4 100644 --- a/src/store/index/skip_index_builder.rs +++ b/src/store/index/skip_index_builder.rs @@ -87,7 +87,7 @@ impl SkipIndexBuilder { } } - pub fn write(mut self, output: &mut W) -> io::Result<()> { + pub fn serialize_into(mut self, output: &mut W) -> io::Result<()> { let mut last_pointer = None; for skip_layer in self.layers.iter_mut() { if let Some(checkpoint) = last_pointer { diff --git a/src/store/mod.rs b/src/store/mod.rs index 88ef9b579e..8dd035fe7f 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -88,7 +88,7 @@ pub mod tests { schema_builder.add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); { - let mut store_writer = StoreWriter::new(writer, compressor, blocksize); + let mut store_writer = StoreWriter::new(writer, compressor, blocksize).unwrap(); for i in 0..num_docs { let mut doc = Document::default(); doc.add_field_value(field_body, LOREM.to_string()); diff --git a/src/store/writer.rs b/src/store/writer.rs index a351d0fcbc..140dd08a78 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,6 +1,9 @@ use std::io::{self, Write}; +use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TryRecvError}; +use std::thread::{self, JoinHandle}; -use common::{BinarySerializable, CountingWriter, VInt}; +use common::{BinarySerializable, VInt}; +use ownedbytes::OwnedBytes; use super::compressors::Compressor; use super::footer::DocStoreFooter; @@ -21,12 +24,23 @@ use crate::DocId; pub struct StoreWriter { compressor: Compressor, block_size: usize, - doc: DocId, - first_doc_in_block: DocId, - offset_index_writer: SkipIndexBuilder, - writer: CountingWriter, + num_docs_in_current_block: DocId, intermediary_buffer: Vec, current_block: Vec, + + writer: WritePtr, + // the channel to communicate with the compressor thread. + compressor_sender: SyncSender, + // the channel to receive data to write from the compressor thread. + data_receiver: Receiver, + + // the handle to check for errors on the thread + compressor_thread_handle: JoinHandle>, +} + +enum BlockCompressorMessage { + AddBlock(DocumentBlock), + Stack((StoreReader, DocumentBlock)), } impl StoreWriter { @@ -34,17 +48,43 @@ impl StoreWriter { /// /// The store writer will writes blocks on disc as /// document are added. 
- pub fn new(writer: WritePtr, compressor: Compressor, block_size: usize) -> StoreWriter { - StoreWriter { + pub fn new( + writer: WritePtr, + compressor: Compressor, + block_size: usize, + ) -> io::Result { + let thread_builder = thread::Builder::new().name("docstore compressor thread".to_string()); + + // Data channel to send fs writes, to write only from current thread + let (data_sender, data_receiver) = sync_channel(3); + // Channel to send uncompressed data to compressor channel + let (block_sender, block_receiver) = sync_channel(3); + let thread_join_handle = thread_builder.spawn(move || { + let mut block_compressor = BlockCompressor::new(compressor, data_sender); + while let Ok(packet) = block_receiver.recv() { + match packet { + BlockCompressorMessage::AddBlock(block) => { + block_compressor.compress_block(block)?; + } + BlockCompressorMessage::Stack((store_reader, block)) => { + block_compressor.stack(block, store_reader)?; + } + } + } + block_compressor.close() + })?; + + Ok(StoreWriter { compressor, block_size, - doc: 0, - first_doc_in_block: 0, - offset_index_writer: SkipIndexBuilder::new(), - writer: CountingWriter::wrap(writer), + num_docs_in_current_block: 0, intermediary_buffer: Vec::new(), current_block: Vec::new(), - } + writer, + compressor_sender: block_sender, + compressor_thread_handle: thread_join_handle, + data_receiver, + }) } pub(crate) fn compressor(&self) -> Compressor { @@ -56,21 +96,53 @@ impl StoreWriter { self.intermediary_buffer.capacity() + self.current_block.capacity() } - /// Store bytes of a serialized document. - /// - /// The document id is implicitely the current number - /// of documents. - pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> { - let doc_num_bytes = serialized_document.len(); - VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; - self.current_block.write_all(serialized_document)?; - self.doc += 1; + fn check_flush_block(&mut self) -> io::Result<()> { if self.current_block.len() > self.block_size { - self.write_and_compress_block()?; + let block = self.get_current_block(); + self.compressor_sender + .send(BlockCompressorMessage::AddBlock(block)) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + self.fetch_writes_from_channel()?; } Ok(()) } + /// Try to empty the queue to write into the file. + /// + /// This is done in order to avoid writing from multiple threads into the file. + fn fetch_writes_from_channel(&mut self) -> io::Result<()> { + loop { + match self.data_receiver.try_recv() { + Ok(data) => { + self.writer.write_all(data.as_slice())?; + } + Err(err) => match err { + TryRecvError::Empty => { + break; + } + TryRecvError::Disconnected => { + return Err(io::Error::new( + io::ErrorKind::Other, + "compressor data channel unexpected closed".to_string(), + )); + } + }, + } + } + + Ok(()) + } + + fn get_current_block(&mut self) -> DocumentBlock { + let block = DocumentBlock { + data: self.current_block.to_owned(), + num_docs_in_block: self.num_docs_in_current_block, + }; + self.current_block.clear(); + self.num_docs_in_current_block = 0; + block + } + /// Store a new document. 
/// /// The document id is implicitely the current number @@ -85,28 +157,145 @@ impl StoreWriter { VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; self.current_block .write_all(&self.intermediary_buffer[..])?; - self.doc += 1; - if self.current_block.len() > self.block_size { - self.write_and_compress_block()?; + self.num_docs_in_current_block += 1; + self.check_flush_block()?; + Ok(()) + } + + /// Store bytes of a serialized document. + /// + /// The document id is implicitely the current number + /// of documents. + pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> { + let doc_num_bytes = serialized_document.len(); + VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; + self.current_block.write_all(serialized_document)?; + self.num_docs_in_current_block += 1; + self.check_flush_block()?; + Ok(()) + } + + /// Stacks a store reader on top of the documents written so far. + /// This method is an optimization compared to iterating over the documents + /// in the store and adding them one by one, as the store's data will + /// not be decompressed and then recompressed. + pub fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { + self.check_flush_block()?; + let block = self.get_current_block(); + self.compressor_sender + .send(BlockCompressorMessage::Stack((store_reader, block))) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(()) + } + + /// Finalized the store writer. + /// + /// Compress the last unfinished block if any, + /// and serializes the skip list index on disc. + pub fn close(mut self) -> io::Result<()> { + let block = self.get_current_block(); + if !block.is_empty() { + self.compressor_sender + .send(BlockCompressorMessage::AddBlock(block)) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; } + drop(self.compressor_sender); + + // Wait for remaining data on the channel to write + while let Ok(data) = self.data_receiver.recv() { + self.writer.write_all(data.as_slice())?; + } + + // The compressor thread should have finished already, since data_receiver stopped + // receiving + let (docstore_footer, offset_index_writer) = self + .compressor_thread_handle + .join() + .map_err(|err| io::Error::new(io::ErrorKind::Other, format!("{:?}", err)))??; + + offset_index_writer.serialize_into(&mut self.writer)?; + docstore_footer.serialize(&mut self.writer)?; + self.writer.terminate() + } +} + +/// BlockCompressor is seperated from StoreWriter, to be run in an own thread +pub struct BlockCompressor { + compressor: Compressor, + doc: DocId, + offset_index_writer: SkipIndexBuilder, + intermediary_buffer: Vec, + written_bytes: usize, + data_sender: SyncSender, +} + +struct DocumentBlock { + data: Vec, + num_docs_in_block: DocId, +} + +impl DocumentBlock { + fn is_empty(&self) -> bool { + self.data.is_empty() + } +} + +impl BlockCompressor { + fn new(compressor: Compressor, data_sender: SyncSender) -> Self { + Self { + compressor, + doc: 0, + offset_index_writer: SkipIndexBuilder::new(), + intermediary_buffer: Vec::new(), + written_bytes: 0, + data_sender, + } + } + + fn compress_block(&mut self, block: DocumentBlock) -> io::Result<()> { + assert!(block.num_docs_in_block > 0); + self.intermediary_buffer.clear(); + self.compressor + .compress_into(&block.data[..], &mut self.intermediary_buffer)?; + + let byte_range = self.written_bytes..self.written_bytes + self.intermediary_buffer.len(); + + self.data_sender + .send(OwnedBytes::new(self.intermediary_buffer.to_owned())) + .map_err(|err| 
io::Error::new(io::ErrorKind::Other, err))?; + + self.written_bytes += byte_range.len(); + + self.register_checkpoint(Checkpoint { + doc_range: self.doc..self.doc + block.num_docs_in_block, + byte_range, + }); Ok(()) } + fn register_checkpoint(&mut self, checkpoint: Checkpoint) { + self.offset_index_writer.insert(checkpoint.clone()); + self.doc = checkpoint.doc_range.end; + } + /// Stacks a store reader on top of the documents written so far. /// This method is an optimization compared to iterating over the documents /// in the store and adding them one by one, as the store's data will /// not be decompressed and then recompressed. - pub fn stack(&mut self, store_reader: &StoreReader) -> io::Result<()> { - if !self.current_block.is_empty() { - self.write_and_compress_block()?; + fn stack(&mut self, block: DocumentBlock, store_reader: StoreReader) -> io::Result<()> { + if !block.is_empty() { + self.compress_block(block)?; } - assert_eq!(self.first_doc_in_block, self.doc); let doc_shift = self.doc; - let start_shift = self.writer.written_bytes() as usize; + let start_shift = self.written_bytes; // just bulk write all of the block of the given reader. - self.writer - .write_all(store_reader.block_data()?.as_slice())?; + self.data_sender + .send(store_reader.block_data()?) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + self.written_bytes += store_reader.block_data()?.as_slice().len(); // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. @@ -119,42 +308,12 @@ impl StoreWriter { } Ok(()) } + fn close(self) -> io::Result<(DocStoreFooter, SkipIndexBuilder)> { + drop(self.data_sender); - fn register_checkpoint(&mut self, checkpoint: Checkpoint) { - self.offset_index_writer.insert(checkpoint.clone()); - self.first_doc_in_block = checkpoint.doc_range.end; - self.doc = checkpoint.doc_range.end; - } - - fn write_and_compress_block(&mut self) -> io::Result<()> { - assert!(self.doc > 0); - self.intermediary_buffer.clear(); - self.compressor - .compress_into(&self.current_block[..], &mut self.intermediary_buffer)?; - let start_offset = self.writer.written_bytes() as usize; - self.writer.write_all(&self.intermediary_buffer)?; - let end_offset = self.writer.written_bytes() as usize; - let end_doc = self.doc; - self.register_checkpoint(Checkpoint { - doc_range: self.first_doc_in_block..end_doc, - byte_range: start_offset..end_offset, - }); - self.current_block.clear(); - Ok(()) - } - - /// Finalized the store writer. - /// - /// Compress the last unfinished block if any, - /// and serializes the skip list index on disc. 
- pub fn close(mut self) -> io::Result<()> { - if !self.current_block.is_empty() { - self.write_and_compress_block()?; - } - let header_offset: u64 = self.writer.written_bytes() as u64; + let header_offset: u64 = self.written_bytes as u64; let footer = DocStoreFooter::new(header_offset, Decompressor::from(self.compressor)); - self.offset_index_writer.write(&mut self.writer)?; - footer.serialize(&mut self.writer)?; - self.writer.terminate() + + Ok((footer, self.offset_index_writer)) } } From 7bf5962554ecc14401d70b9c2a47ff1356f7bbf0 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 16 Jun 2022 16:50:56 +0800 Subject: [PATCH 15/39] merge match, explicit type --- src/store/writer.rs | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index 140dd08a78..f68fce831e 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -56,9 +56,13 @@ impl StoreWriter { let thread_builder = thread::Builder::new().name("docstore compressor thread".to_string()); // Data channel to send fs writes, to write only from current thread - let (data_sender, data_receiver) = sync_channel(3); + let (data_sender, data_receiver): (SyncSender, Receiver) = + sync_channel(3); // Channel to send uncompressed data to compressor channel - let (block_sender, block_receiver) = sync_channel(3); + let (block_sender, block_receiver): ( + SyncSender, + Receiver, + ) = sync_channel(3); let thread_join_handle = thread_builder.spawn(move || { let mut block_compressor = BlockCompressor::new(compressor, data_sender); while let Ok(packet) = block_receiver.recv() { @@ -116,17 +120,15 @@ impl StoreWriter { Ok(data) => { self.writer.write_all(data.as_slice())?; } - Err(err) => match err { - TryRecvError::Empty => { - break; - } - TryRecvError::Disconnected => { - return Err(io::Error::new( - io::ErrorKind::Other, - "compressor data channel unexpected closed".to_string(), - )); - } - }, + Err(TryRecvError::Empty) => { + break; + } + Err(TryRecvError::Disconnected) => { + return Err(io::Error::new( + io::ErrorKind::Other, + "compressor data channel unexpected closed".to_string(), + )); + } } } From efabcbcdf5f23d8536177462ee630ed7648f99ab Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 17 Jun 2022 10:21:20 +0200 Subject: [PATCH 16/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index f68fce831e..1a15ab4eaa 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -222,7 +222,7 @@ impl StoreWriter { } } -/// BlockCompressor is seperated from StoreWriter, to be run in an own thread +/// BlockCompressor is separated from StoreWriter, to be run in an own thread pub struct BlockCompressor { compressor: Compressor, doc: DocId, From 8b6647e90877ac0f110a543b355646cd625c6d38 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 21 Jun 2022 10:21:35 +0800 Subject: [PATCH 17/39] move writer to compressor thread --- common/src/writer.rs | 2 +- src/store/writer.rs | 100 +++++++++++-------------------------------- 2 files changed, 25 insertions(+), 77 deletions(-) diff --git a/common/src/writer.rs b/common/src/writer.rs index 20f56221d7..9b8b86908d 100644 --- a/common/src/writer.rs +++ b/common/src/writer.rs @@ -62,7 +62,7 @@ impl TerminatingWrite for CountingWriter { pub struct AntiCallToken(()); /// Trait used to indicate when no more write need to be done on a writer -pub trait TerminatingWrite: Write { +pub trait 
TerminatingWrite: Write + Send { /// Indicate that the writer will no longer be used. Internally call terminate_ref. fn terminate(mut self) -> io::Result<()> where Self: Sized { diff --git a/src/store/writer.rs b/src/store/writer.rs index 1a15ab4eaa..69273cd163 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,9 +1,8 @@ use std::io::{self, Write}; -use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TryRecvError}; +use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; use std::thread::{self, JoinHandle}; -use common::{BinarySerializable, VInt}; -use ownedbytes::OwnedBytes; +use common::{BinarySerializable, CountingWriter, VInt}; use super::compressors::Compressor; use super::footer::DocStoreFooter; @@ -28,14 +27,11 @@ pub struct StoreWriter { intermediary_buffer: Vec, current_block: Vec, - writer: WritePtr, - // the channel to communicate with the compressor thread. + // the channel to send data to the compressor thread. compressor_sender: SyncSender, - // the channel to receive data to write from the compressor thread. - data_receiver: Receiver, // the handle to check for errors on the thread - compressor_thread_handle: JoinHandle>, + compressor_thread_handle: JoinHandle>, } enum BlockCompressorMessage { @@ -55,16 +51,13 @@ impl StoreWriter { ) -> io::Result { let thread_builder = thread::Builder::new().name("docstore compressor thread".to_string()); - // Data channel to send fs writes, to write only from current thread - let (data_sender, data_receiver): (SyncSender, Receiver) = - sync_channel(3); // Channel to send uncompressed data to compressor channel let (block_sender, block_receiver): ( SyncSender, Receiver, ) = sync_channel(3); let thread_join_handle = thread_builder.spawn(move || { - let mut block_compressor = BlockCompressor::new(compressor, data_sender); + let mut block_compressor = BlockCompressor::new(compressor, writer); while let Ok(packet) = block_receiver.recv() { match packet { BlockCompressorMessage::AddBlock(block) => { @@ -84,10 +77,8 @@ impl StoreWriter { num_docs_in_current_block: 0, intermediary_buffer: Vec::new(), current_block: Vec::new(), - writer, compressor_sender: block_sender, compressor_thread_handle: thread_join_handle, - data_receiver, }) } @@ -106,35 +97,10 @@ impl StoreWriter { self.compressor_sender .send(BlockCompressorMessage::AddBlock(block)) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - self.fetch_writes_from_channel()?; } Ok(()) } - /// Try to empty the queue to write into the file. - /// - /// This is done in order to avoid writing from multiple threads into the file. 
- fn fetch_writes_from_channel(&mut self) -> io::Result<()> { - loop { - match self.data_receiver.try_recv() { - Ok(data) => { - self.writer.write_all(data.as_slice())?; - } - Err(TryRecvError::Empty) => { - break; - } - Err(TryRecvError::Disconnected) => { - return Err(io::Error::new( - io::ErrorKind::Other, - "compressor data channel unexpected closed".to_string(), - )); - } - } - } - - Ok(()) - } - fn get_current_block(&mut self) -> DocumentBlock { let block = DocumentBlock { data: self.current_block.to_owned(), @@ -204,21 +170,11 @@ impl StoreWriter { } drop(self.compressor_sender); - // Wait for remaining data on the channel to write - while let Ok(data) = self.data_receiver.recv() { - self.writer.write_all(data.as_slice())?; - } - - // The compressor thread should have finished already, since data_receiver stopped - // receiving - let (docstore_footer, offset_index_writer) = self - .compressor_thread_handle + self.compressor_thread_handle .join() .map_err(|err| io::Error::new(io::ErrorKind::Other, format!("{:?}", err)))??; - offset_index_writer.serialize_into(&mut self.writer)?; - docstore_footer.serialize(&mut self.writer)?; - self.writer.terminate() + Ok(()) } } @@ -228,8 +184,7 @@ pub struct BlockCompressor { doc: DocId, offset_index_writer: SkipIndexBuilder, intermediary_buffer: Vec, - written_bytes: usize, - data_sender: SyncSender, + writer: CountingWriter, } struct DocumentBlock { @@ -244,14 +199,13 @@ impl DocumentBlock { } impl BlockCompressor { - fn new(compressor: Compressor, data_sender: SyncSender) -> Self { + fn new(compressor: Compressor, writer: WritePtr) -> Self { Self { compressor, doc: 0, offset_index_writer: SkipIndexBuilder::new(), intermediary_buffer: Vec::new(), - written_bytes: 0, - data_sender, + writer: CountingWriter::wrap(writer), } } @@ -261,17 +215,13 @@ impl BlockCompressor { self.compressor .compress_into(&block.data[..], &mut self.intermediary_buffer)?; - let byte_range = self.written_bytes..self.written_bytes + self.intermediary_buffer.len(); - - self.data_sender - .send(OwnedBytes::new(self.intermediary_buffer.to_owned())) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - self.written_bytes += byte_range.len(); + let start_offset = self.writer.written_bytes() as usize; + self.writer.write_all(&self.intermediary_buffer)?; + let end_offset = self.writer.written_bytes() as usize; self.register_checkpoint(Checkpoint { doc_range: self.doc..self.doc + block.num_docs_in_block, - byte_range, + byte_range: start_offset..end_offset, }); Ok(()) } @@ -290,14 +240,11 @@ impl BlockCompressor { self.compress_block(block)?; } let doc_shift = self.doc; - let start_shift = self.written_bytes; + let start_shift = self.writer.written_bytes() as usize; // just bulk write all of the block of the given reader. - self.data_sender - .send(store_reader.block_data()?) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - self.written_bytes += store_reader.block_data()?.as_slice().len(); + self.writer + .write_all(store_reader.block_data()?.as_slice())?; // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. 
@@ -310,12 +257,13 @@ impl BlockCompressor { } Ok(()) } - fn close(self) -> io::Result<(DocStoreFooter, SkipIndexBuilder)> { - drop(self.data_sender); - - let header_offset: u64 = self.written_bytes as u64; - let footer = DocStoreFooter::new(header_offset, Decompressor::from(self.compressor)); + fn close(mut self) -> io::Result<()> { + let header_offset: u64 = self.writer.written_bytes() as u64; + let docstore_footer = + DocStoreFooter::new(header_offset, Decompressor::from(self.compressor)); - Ok((footer, self.offset_index_writer)) + self.offset_index_writer.serialize_into(&mut self.writer)?; + docstore_footer.serialize(&mut self.writer)?; + self.writer.terminate() } } From 449594f67a5b5c7aaf59e0e363b4082dcab7651e Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 21 Jun 2022 07:48:12 +0200 Subject: [PATCH 18/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index 69273cd163..735d01ed6a 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -49,7 +49,7 @@ impl StoreWriter { compressor: Compressor, block_size: usize, ) -> io::Result { - let thread_builder = thread::Builder::new().name("docstore compressor thread".to_string()); + let thread_builder = thread::Builder::new().name("docstore-compressor-thread".to_string()); // Channel to send uncompressed data to compressor channel let (block_sender, block_receiver): ( From 0135fbc4c83483fa9044ceda0f2fbc2bc365e52a Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 21 Jun 2022 07:48:19 +0200 Subject: [PATCH 19/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index 735d01ed6a..27fc2351a0 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -68,7 +68,8 @@ impl StoreWriter { } } } - block_compressor.close() + block_compressor.close()?; + Ok(()) })?; Ok(StoreWriter { From 79e42d4a6da02cfed70641188c70777b52d98859 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 21 Jun 2022 09:21:10 +0200 Subject: [PATCH 20/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/store/writer.rs b/src/store/writer.rs index 27fc2351a0..cd80cd9082 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -92,6 +92,7 @@ impl StoreWriter { self.intermediary_buffer.capacity() + self.current_block.capacity() } + /// Checks if the current block is full, and if so, compresses and flushes it. fn check_flush_block(&mut self) -> io::Result<()> { if self.current_block.len() > self.block_size { let block = self.get_current_block(); From 0bc6b4a11768a09d8373e50473606d09530eab90 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 21 Jun 2022 16:10:48 +0800 Subject: [PATCH 21/39] renames and refactoring --- src/store/writer.rs | 54 ++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index cd80cd9082..244bf72fc6 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -29,14 +29,13 @@ pub struct StoreWriter { // the channel to send data to the compressor thread. 
compressor_sender: SyncSender, - // the handle to check for errors on the thread compressor_thread_handle: JoinHandle>, } enum BlockCompressorMessage { AddBlock(DocumentBlock), - Stack((StoreReader, DocumentBlock)), + Stack(StoreReader), } impl StoreWriter { @@ -61,10 +60,10 @@ impl StoreWriter { while let Ok(packet) = block_receiver.recv() { match packet { BlockCompressorMessage::AddBlock(block) => { - block_compressor.compress_block(block)?; + block_compressor.compress_block_and_write(block)?; } - BlockCompressorMessage::Stack((store_reader, block)) => { - block_compressor.stack(block, store_reader)?; + BlockCompressorMessage::Stack(store_reader) => { + block_compressor.stack(store_reader)?; } } } @@ -95,22 +94,25 @@ impl StoreWriter { /// Checks if the current block is full, and if so, compresses and flushes it. fn check_flush_block(&mut self) -> io::Result<()> { if self.current_block.len() > self.block_size { - let block = self.get_current_block(); - self.compressor_sender - .send(BlockCompressorMessage::AddBlock(block)) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + self.send_current_block_to_compressor()?; } Ok(()) } - fn get_current_block(&mut self) -> DocumentBlock { + /// Flushes current uncompressed block and sends to compressor. + fn send_current_block_to_compressor(&mut self) -> io::Result<()> { let block = DocumentBlock { data: self.current_block.to_owned(), num_docs_in_block: self.num_docs_in_current_block, }; self.current_block.clear(); self.num_docs_in_current_block = 0; - block + if !block.is_empty() { + self.compressor_sender + .send(BlockCompressorMessage::AddBlock(block)) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + } + Ok(()) } /// Store a new document. @@ -150,10 +152,10 @@ impl StoreWriter { /// in the store and adding them one by one, as the store's data will /// not be decompressed and then recompressed. pub fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { - self.check_flush_block()?; - let block = self.get_current_block(); + // We flush the current block first before stacking + self.send_current_block_to_compressor()?; self.compressor_sender - .send(BlockCompressorMessage::Stack((store_reader, block))) + .send(BlockCompressorMessage::Stack(store_reader)) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; Ok(()) @@ -164,12 +166,7 @@ impl StoreWriter { /// Compress the last unfinished block if any, /// and serializes the skip list index on disc. 
pub fn close(mut self) -> io::Result<()> { - let block = self.get_current_block(); - if !block.is_empty() { - self.compressor_sender - .send(BlockCompressorMessage::AddBlock(block)) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - } + self.send_current_block_to_compressor()?; drop(self.compressor_sender); self.compressor_thread_handle @@ -183,7 +180,7 @@ impl StoreWriter { /// BlockCompressor is separated from StoreWriter, to be run in an own thread pub struct BlockCompressor { compressor: Compressor, - doc: DocId, + first_doc_in_block: DocId, offset_index_writer: SkipIndexBuilder, intermediary_buffer: Vec, writer: CountingWriter, @@ -204,14 +201,14 @@ impl BlockCompressor { fn new(compressor: Compressor, writer: WritePtr) -> Self { Self { compressor, - doc: 0, + first_doc_in_block: 0, offset_index_writer: SkipIndexBuilder::new(), intermediary_buffer: Vec::new(), writer: CountingWriter::wrap(writer), } } - fn compress_block(&mut self, block: DocumentBlock) -> io::Result<()> { + fn compress_block_and_write(&mut self, block: DocumentBlock) -> io::Result<()> { assert!(block.num_docs_in_block > 0); self.intermediary_buffer.clear(); self.compressor @@ -222,7 +219,7 @@ impl BlockCompressor { let end_offset = self.writer.written_bytes() as usize; self.register_checkpoint(Checkpoint { - doc_range: self.doc..self.doc + block.num_docs_in_block, + doc_range: self.first_doc_in_block..self.first_doc_in_block + block.num_docs_in_block, byte_range: start_offset..end_offset, }); Ok(()) @@ -230,18 +227,15 @@ impl BlockCompressor { fn register_checkpoint(&mut self, checkpoint: Checkpoint) { self.offset_index_writer.insert(checkpoint.clone()); - self.doc = checkpoint.doc_range.end; + self.first_doc_in_block = checkpoint.doc_range.end; } /// Stacks a store reader on top of the documents written so far. /// This method is an optimization compared to iterating over the documents /// in the store and adding them one by one, as the store's data will /// not be decompressed and then recompressed. - fn stack(&mut self, block: DocumentBlock, store_reader: StoreReader) -> io::Result<()> { - if !block.is_empty() { - self.compress_block(block)?; - } - let doc_shift = self.doc; + fn stack(&mut self, store_reader: StoreReader) -> io::Result<()> { + let doc_shift = self.first_doc_in_block; let start_shift = self.writer.written_bytes() as usize; // just bulk write all of the block of the given reader. 
From 2b713f0977f139afc1a71fb1b9464edc61394d16 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 23 Jun 2022 09:11:22 +0200 Subject: [PATCH 22/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index 244bf72fc6..465ceaea99 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -126,7 +126,7 @@ impl StoreWriter { // intermediary_buffer due to the borrow checker // a new buffer costs ~1% indexing performance let doc_num_bytes = self.intermediary_buffer.len(); - VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; + VInt(doc_num_bytes as u64).serialize_into_vec(&mut self.current_block); self.current_block .write_all(&self.intermediary_buffer[..])?; self.num_docs_in_current_block += 1; From c3220bece05e2c99226eab1d81236211171b9a7c Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 23 Jun 2022 09:11:36 +0200 Subject: [PATCH 23/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index 465ceaea99..dd6bd903e0 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -141,7 +141,7 @@ impl StoreWriter { pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> { let doc_num_bytes = serialized_document.len(); VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; - self.current_block.write_all(serialized_document)?; + self.current_block.extend_from_slice(serialized_document); self.num_docs_in_current_block += 1; self.check_flush_block()?; Ok(()) From ad76d1100804f8322fb6050183a59ada76ab4e9c Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 23 Jun 2022 09:11:48 +0200 Subject: [PATCH 24/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index dd6bd903e0..f15c5a8152 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -140,7 +140,7 @@ impl StoreWriter { /// of documents. pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> { let doc_num_bytes = serialized_document.len(); - VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; + VInt(doc_num_bytes as u64).serialize_into_vec(&mut self.current_block); self.current_block.extend_from_slice(serialized_document); self.num_docs_in_current_block += 1; self.check_flush_block()?; From 9baefbe2ab8695952f3db5dca30b8b543b00bf38 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 23 Jun 2022 09:24:49 +0200 Subject: [PATCH 25/39] Update src/store/writer.rs Co-authored-by: Paul Masurel --- src/store/writer.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/store/writer.rs b/src/store/writer.rs index f15c5a8152..3327dd1e97 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -101,17 +101,20 @@ impl StoreWriter { /// Flushes current uncompressed block and sends to compressor. fn send_current_block_to_compressor(&mut self) -> io::Result<()> { + // We don't do anything if the current block is empty to begin with. 
+ if self.current_block.is_empty() { + return Ok(()); + } let block = DocumentBlock { data: self.current_block.to_owned(), num_docs_in_block: self.num_docs_in_current_block, }; self.current_block.clear(); self.num_docs_in_current_block = 0; - if !block.is_empty() { - self.compressor_sender - .send(BlockCompressorMessage::AddBlock(block)) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - } + self.compressor_sender + .send(BlockCompressorMessage::AddBlock(block)) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + Ok(()) } @@ -191,12 +194,6 @@ struct DocumentBlock { num_docs_in_block: DocId, } -impl DocumentBlock { - fn is_empty(&self) -> bool { - self.data.is_empty() - } -} - impl BlockCompressor { fn new(compressor: Compressor, writer: WritePtr) -> Self { Self { From 437cd350a2e4c634419d3b1133a21e1cceefe77a Mon Sep 17 00:00:00 2001 From: Antoine G <325288+saroh@users.noreply.github.com> Date: Tue, 28 Jun 2022 06:55:47 +0200 Subject: [PATCH 26/39] Add support for phrase slop in query language (#1393) Closes #1390 --- query-grammar/Cargo.toml | 2 +- query-grammar/src/query_grammar.rs | 63 ++++++++++++++++++++------ query-grammar/src/user_input_ast.rs | 11 +++-- src/query/phrase_query/phrase_query.rs | 9 +++- src/query/query_parser/logical_ast.rs | 11 ++++- src/query/query_parser/query_parser.rs | 39 +++++++++++++--- 6 files changed, 107 insertions(+), 28 deletions(-) diff --git a/query-grammar/Cargo.toml b/query-grammar/Cargo.toml index b395de9138..e4f2df98b4 100644 --- a/query-grammar/Cargo.toml +++ b/query-grammar/Cargo.toml @@ -14,4 +14,4 @@ edition = "2018" [dependencies] combine = {version="4", default-features=false, features=[] } once_cell = "1.7.2" -regex ={ version = "1.5.4", default-features = false, features = ["std"] } +regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] } diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index dc8d06fbf3..4f05fbbea5 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -16,9 +16,9 @@ use crate::Occur; // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to // special characters. const SPECIAL_CHARS: &[char] = &[ - '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ', + '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ', ]; -const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#; +const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#; /// Parses a field_name /// A field name must have at least one character and be followed by a colon. 
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> { fn term_val<'a>() -> impl Parser<&'a str, Output = String> { let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"')); - phrase.or(word()) + negative_number().or(phrase.or(word())) } fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - let term_val_with_field = negative_number().or(term_val()); - (field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral { + (field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral { field_name: Some(field_name), phrase, + slop, + }) +} + +fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { + let slop = + (char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::() { + Ok(d) => Ok(d), + _ => Err(StringStreamError::UnexpectedParse), + }); + optional(slop).map(|slop| match slop { + Some(d) => d, + _ => 0, }) } fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_default_field = term_val().map(|phrase| UserInputLiteral { + let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral { field_name: None, phrase, + slop, }); + attempt(term_query()) .or(term_default_field) .map(UserInputLeaf::from) @@ -522,18 +536,10 @@ mod test { super::field_name().parse(".my.field.name:a"), Ok((".my.field.name".to_string(), "a")) ); - assert_eq!( - super::field_name().parse(r#"my\ field:a"#), - Ok(("my field".to_string(), "a")) - ); assert_eq!( super::field_name().parse(r#"にんじん:a"#), Ok(("にんじん".to_string(), "a")) ); - assert_eq!( - super::field_name().parse("my\\ field\\ name:a"), - Ok(("my field name".to_string(), "a")) - ); assert_eq!( super::field_name().parse(r#"my\field:a"#), Ok((r#"my\field"#.to_string(), "a")) @@ -562,6 +568,17 @@ mod test { super::field_name().parse("_my_field:a"), Ok(("_my_field".to_string(), "a")) ); + assert_eq!( + super::field_name().parse("~my~field:a"), + Ok(("~my~field".to_string(), "a")) + ); + for special_char in SPECIAL_CHARS.iter() { + let query = &format!("\\{special_char}my\\{special_char}field:a"); + assert_eq!( + super::field_name().parse(&query), + Ok((format!("{special_char}my{special_char}field"), "a")) + ); + } } #[test] @@ -714,4 +731,22 @@ mod test { ); test_is_parse_err("abc + "); } + + #[test] + fn test_slop() { + assert!(parse_to_ast().parse("\"a b\"~").is_err()); + assert!(parse_to_ast().parse("foo:\"a b\"~").is_err()); + assert!(parse_to_ast().parse("\"a b\"~a").is_err()); + assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err()); + + test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")"); + test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\""); + test_parse_query_to_ast_helper("~Document", "\"~Document\""); + test_parse_query_to_ast_helper("a~2", "\"a~2\""); + test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\""); + test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1"); + test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3"); + test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300"); + test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2"); + } } diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index 359900bab1..3130ddbbe6 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -40,14 +40,19 @@ impl Debug for UserInputLeaf { pub struct UserInputLiteral { pub field_name: Option, pub phrase: String, + pub slop: u32, } impl 
fmt::Debug for UserInputLiteral { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - match self.field_name { - Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase), - None => write!(formatter, "\"{}\"", self.phrase), + if let Some(ref field) = self.field_name { + write!(formatter, "\"{}\":", field)?; } + write!(formatter, "\"{}\"", self.phrase)?; + if self.slop > 0 { + write!(formatter, "~{}", self.slop)?; + } + Ok(()) } } diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 1996c95f68..ae3916318b 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -43,7 +43,12 @@ impl PhraseQuery { /// Creates a new `PhraseQuery` given a list of terms and their offsets. /// /// Can be used to provide custom offset for each term. - pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery { + pub fn new_with_offset(terms: Vec<(usize, Term)>) -> PhraseQuery { + PhraseQuery::new_with_offset_and_slop(terms, 0) + } + + /// Creates a new `PhraseQuery` given a list of terms, their offsets and a slop + pub fn new_with_offset_and_slop(mut terms: Vec<(usize, Term)>, slop: u32) -> PhraseQuery { assert!( terms.len() > 1, "A phrase query is required to have strictly more than one term." @@ -57,7 +62,7 @@ impl PhraseQuery { PhraseQuery { field, phrase_terms: terms, - slop: 0, + slop, } } diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index 9d26c3cd67..2eb75e675a 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -8,7 +8,7 @@ use crate::Score; #[derive(Clone)] pub enum LogicalLiteral { Term(Term), - Phrase(Vec<(usize, Term)>), + Phrase(Vec<(usize, Term)>, u32), Range { field: Field, value_type: Type, @@ -74,7 +74,14 @@ impl fmt::Debug for LogicalLiteral { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { match *self { LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term), - LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms), + LogicalLiteral::Phrase(ref terms, slop) => { + write!(formatter, "\"{:?}\"", terms)?; + if slop > 0 { + write!(formatter, "~{:?}", slop) + } else { + Ok(()) + } + } LogicalLiteral::Range { ref lower, ref upper, diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index dbb801e5b3..5241a699d6 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -168,6 +168,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// It is also possible to define a boost for a some specific field, at the query parser level. /// (See [`set_boost(...)`](#method.set_field_boost) ). Typically you may want to boost a title /// field. +/// +/// Phrase terms support the `~` slop operator which allows to set the phrase's matching +/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`. #[derive(Clone)] pub struct QueryParser { schema: Schema, @@ -405,6 +408,7 @@ impl QueryParser { field: Field, json_path: &str, phrase: &str, + slop: u32, ) -> Result, QueryParserError> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -461,6 +465,7 @@ impl QueryParser { field_name, field, phrase, + slop, &text_analyzer, index_record_option, )? 
@@ -626,7 +631,9 @@ impl QueryParser { self.compute_path_triplets_for_literal(&literal)?; let mut asts: Vec = Vec::new(); for (field, json_path, phrase) in term_phrases { - for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? { + for ast in + self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)? + { // Apply some field specific boost defined at the query parser level. let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); @@ -670,9 +677,9 @@ impl QueryParser { fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { match logical_literal { LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)), - LogicalLiteral::Phrase(term_with_offsets) => { - Box::new(PhraseQuery::new_with_offset(term_with_offsets)) - } + LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new( + PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop), + ), LogicalLiteral::Range { field, value_type, @@ -689,6 +696,7 @@ fn generate_literals_for_str( field_name: &str, field: Field, phrase: &str, + slop: u32, text_analyzer: &TextAnalyzer, index_record_option: IndexRecordOption, ) -> Result, QueryParserError> { @@ -710,7 +718,7 @@ fn generate_literals_for_str( field_name.to_string(), )); } - Ok(Some(LogicalLiteral::Phrase(terms))) + Ok(Some(LogicalLiteral::Phrase(terms, slop))) } fn generate_literals_for_json_object( @@ -741,7 +749,7 @@ fn generate_literals_for_json_object( field_name.to_string(), )); } - logical_literals.push(LogicalLiteral::Phrase(terms)); + logical_literals.push(LogicalLiteral::Phrase(terms, 0)); Ok(logical_literals) } @@ -1493,4 +1501,23 @@ mod test { assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]); assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]); } + + #[test] + pub fn test_phrase_slop() { + test_parse_query_to_logical_ast_helper( + "\"a b\"~0", + r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#, + false, + ); + test_parse_query_to_logical_ast_helper( + "\"a b\"~2", + r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#, + false, + ); + test_parse_query_to_logical_ast_helper( + "title:\"a b~4\"~2", + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#, + false, + ); + } } From db1836691ef9b4f963070bfd9ef13c6d44d2a074 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 28 Jun 2022 15:21:39 +0800 Subject: [PATCH 27/39] fix visibility (#1398) --- src/aggregation/collector.rs | 2 ++ src/aggregation/intermediate_agg_result.rs | 5 +---- src/aggregation/mod.rs | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index c9510d9263..f3638fae82 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -9,6 +9,7 @@ use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_valida use crate::collector::{Collector, SegmentCollector}; use crate::{SegmentReader, TantivyError}; +/// The default max bucket count, before the aggregation fails. pub const MAX_BUCKET_COUNT: u32 = 65000; /// Collector for aggregations. @@ -22,6 +23,7 @@ pub struct AggregationCollector { impl AggregationCollector { /// Create collector from aggregation request. 
/// + /// Aggregation fails when the total bucket count is higher than max_bucket_count. /// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset pub fn from_aggs(agg: Aggregations, max_bucket_count: Option) -> Self { Self { diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index 20eef59c07..97a5955c2c 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -36,10 +36,7 @@ pub struct IntermediateAggregationResults { impl IntermediateAggregationResults { /// Convert intermediate result and its aggregation request to the final result. - pub(crate) fn into_final_bucket_result( - self, - req: Aggregations, - ) -> crate::Result { + pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result { self.into_final_bucket_result_internal(&(req.into())) } diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 7f6f8378ce..69ae782db7 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -166,6 +166,7 @@ use std::fmt::Display; pub use collector::{ AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector, + MAX_BUCKET_COUNT, }; use itertools::Itertools; use serde::{Deserialize, Serialize}; From 7eb267341e7792d4442332ce937a2f6ec39a0e70 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 29 Jun 2022 22:52:50 +0800 Subject: [PATCH 28/39] make errors cloneable --- src/directory/directory.rs | 10 +++++-- src/directory/error.rs | 43 ++++++++++++++++++++---------- src/directory/managed_directory.rs | 9 +++---- src/directory/mmap_directory.rs | 9 ++++--- src/directory/ram_directory.rs | 6 ++--- src/error.rs | 14 +++++++--- 6 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/directory/directory.rs b/src/directory/directory.rs index e54b8dee32..7c7f81f660 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -1,6 +1,7 @@ use std::io::Write; use std::marker::{Send, Sync}; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::time::Duration; use std::{fmt, io, thread}; @@ -62,7 +63,12 @@ impl Drop for DirectoryLockGuard { enum TryAcquireLockError { FileExists, - IoError(io::Error), + IoError(Arc), +} +impl From for TryAcquireLockError { + fn from(io_error: io::Error) -> Self { + Self::IoError(Arc::new(io_error)) + } } fn try_acquire_lock( @@ -73,7 +79,7 @@ fn try_acquire_lock( OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists, OpenWriteError::IoError { io_error, .. } => TryAcquireLockError::IoError(io_error), })?; - write.flush().map_err(TryAcquireLockError::IoError)?; + write.flush().map_err(TryAcquireLockError::from)?; Ok(DirectoryLock::from(Box::new(DirectoryLockGuard { directory: directory.box_clone(), path: filepath.to_owned(), diff --git a/src/directory/error.rs b/src/directory/error.rs index 4bb273ce02..5292bcf3fb 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -1,10 +1,11 @@ use std::path::PathBuf; +use std::sync::Arc; use std::{fmt, io}; use crate::Version; /// Error while trying to acquire a directory lock. -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum LockError { /// Failed to acquired a lock as it is already held by another /// client. @@ -16,11 +17,18 @@ pub enum LockError { LockBusy, /// Trying to acquire a lock failed with an `IoError` #[error("Failed to acquire the lock due to an io:Error.")] - IoError(io::Error), + IoError(Arc), +} + +impl LockError { + /// Wraps an io error. 
+ pub fn wrap_io_error(io_error: io::Error) -> Self { + Self::IoError(Arc::new(io_error)) + } } /// Error that may occur when opening a directory -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum OpenDirectoryError { /// The underlying directory does not exists. #[error("Directory does not exist: '{0}'.")] @@ -30,12 +38,12 @@ pub enum OpenDirectoryError { NotADirectory(PathBuf), /// Failed to create a temp directory. #[error("Failed to create a temporary directory: '{0}'.")] - FailedToCreateTempDir(io::Error), + FailedToCreateTempDir(Arc), /// IoError #[error("IoError '{io_error:?}' while create directory in: '{directory_path:?}'.")] IoError { /// underlying io Error. - io_error: io::Error, + io_error: Arc, /// directory we tried to open. directory_path: PathBuf, }, @@ -45,14 +53,14 @@ impl OpenDirectoryError { /// Wraps an io error. pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self { Self::IoError { - io_error, + io_error: Arc::new(io_error), directory_path, } } } /// Error that may occur when starting to write in a file -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum OpenWriteError { /// Our directory is WORM, writing an existing file is forbidden. /// Checkout the `Directory` documentation. @@ -63,7 +71,7 @@ pub enum OpenWriteError { #[error("IoError '{io_error:?}' while opening file for write: '{filepath}'.")] IoError { /// The underlying `io::Error`. - io_error: io::Error, + io_error: Arc, /// File path of the file that tantivy failed to open for write. filepath: PathBuf, }, @@ -72,11 +80,15 @@ pub enum OpenWriteError { impl OpenWriteError { /// Wraps an io error. pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self { - Self::IoError { io_error, filepath } + Self::IoError { + io_error: Arc::new(io_error), + filepath, + } } } /// Type of index incompatibility between the library and the index found on disk /// Used to catch and provide a hint to solve this incompatibility issue +#[derive(Clone)] pub enum Incompatibility { /// This library cannot decompress the index found on disk CompressionMismatch { @@ -135,7 +147,7 @@ impl fmt::Debug for Incompatibility { } /// Error that may occur when accessing a file read -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum OpenReadError { /// The file does not exists. #[error("Files does not exists: {0:?}")] @@ -146,7 +158,7 @@ pub enum OpenReadError { )] IoError { /// The underlying `io::Error`. - io_error: io::Error, + io_error: Arc, /// File path of the file that tantivy failed to open for read. filepath: PathBuf, }, @@ -158,11 +170,14 @@ pub enum OpenReadError { impl OpenReadError { /// Wraps an io error. pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self { - Self::IoError { io_error, filepath } + Self::IoError { + io_error: Arc::new(io_error), + filepath, + } } } /// Error that may occur when trying to delete a file -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum DeleteError { /// The file does not exists. #[error("File does not exists: '{0}'.")] @@ -172,7 +187,7 @@ pub enum DeleteError { #[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")] IoError { /// The underlying `io::Error`. - io_error: io::Error, + io_error: Arc, /// File path of the file that tantivy failed to delete. 
filepath: PathBuf, }, diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index b153355677..1b7c23a050 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -242,16 +242,13 @@ impl ManagedDirectory { /// Verify checksum of a managed file pub fn validate_checksum(&self, path: &Path) -> result::Result { let reader = self.directory.open_read(path)?; - let (footer, data) = - Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IoError { - io_error, - filepath: path.to_path_buf(), - })?; + let (footer, data) = Footer::extract_footer(reader) + .map_err(|io_error| OpenReadError::wrap_io_error(io_error, path.to_path_buf()))?; let bytes = data .read_bytes() .map_err(|io_error| OpenReadError::IoError { + io_error: Arc::new(io_error), filepath: path.to_path_buf(), - io_error, })?; let mut hasher = Hasher::new(); hasher.update(bytes.as_slice()); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index b17c3625fa..1c7e1bc0a9 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -174,7 +174,8 @@ impl MmapDirectory { /// This is mostly useful to test the MmapDirectory itself. /// For your unit tests, prefer the RamDirectory. pub fn create_from_tempdir() -> Result { - let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?; + let tempdir = TempDir::new() + .map_err(|io_err| OpenDirectoryError::FailedToCreateTempDir(Arc::new(io_err)))?; Ok(MmapDirectory::new( tempdir.path().to_path_buf(), Some(tempdir), @@ -342,7 +343,7 @@ impl Directory for MmapDirectory { DeleteError::FileDoesNotExist(path.to_owned()) } else { DeleteError::IoError { - io_error: e, + io_error: Arc::new(e), filepath: path.to_path_buf(), } } @@ -422,9 +423,9 @@ impl Directory for MmapDirectory { .write(true) .create(true) //< if the file does not exist yet, create it. .open(&full_path) - .map_err(LockError::IoError)?; + .map_err(LockError::wrap_io_error)?; if lock.is_blocking { - file.lock_exclusive().map_err(LockError::IoError)?; + file.lock_exclusive().map_err(LockError::wrap_io_error)?; } else { file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? } diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index f501b100d2..bb9c32050c 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -172,7 +172,7 @@ impl Directory for RamDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { fail_point!("RamDirectory::delete", |_| { Err(DeleteError::IoError { - io_error: io::Error::from(io::ErrorKind::Other), + io_error: Arc::new(io::Error::from(io::ErrorKind::Other)), filepath: path.to_path_buf(), }) }); @@ -184,7 +184,7 @@ impl Directory for RamDirectory { .fs .read() .map_err(|e| OpenReadError::IoError { - io_error: io::Error::new(io::ErrorKind::Other, e.to_string()), + io_error: Arc::new(io::Error::new(io::ErrorKind::Other, e.to_string())), filepath: path.to_path_buf(), })? .exists(path)) @@ -208,7 +208,7 @@ impl Directory for RamDirectory { self.open_read(path)? .read_bytes() .map_err(|io_error| OpenReadError::IoError { - io_error, + io_error: Arc::new(io_error), filepath: path.to_path_buf(), })?; Ok(bytes.as_slice().to_owned()) diff --git a/src/error.rs b/src/error.rs index baccb32580..e8136cdefa 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,7 +1,7 @@ //! Definition of Tantivy's errors and results. 
use std::path::PathBuf; -use std::sync::PoisonError; +use std::sync::{Arc, PoisonError}; use std::{fmt, io}; use thiserror::Error; @@ -15,6 +15,7 @@ use crate::{query, schema}; /// Represents a `DataCorruption` error. /// /// When facing data corruption, tantivy actually panics or returns this error. +#[derive(Clone)] pub struct DataCorruption { filepath: Option, comment: String, @@ -50,7 +51,7 @@ impl fmt::Debug for DataCorruption { } /// The library's error enum -#[derive(Debug, Error)] +#[derive(Debug, Clone, Error)] pub enum TantivyError { /// Failed to open the directory. #[error("Failed to open the directory: '{0:?}'")] @@ -69,7 +70,7 @@ pub enum TantivyError { LockFailure(LockError, Option), /// IO Error. #[error("An IO error occurred: '{0}'")] - IoError(#[from] io::Error), + IoError(Arc), /// Data corruption. #[error("Data corrupted: '{0:?}'")] DataCorruption(DataCorruption), @@ -125,6 +126,11 @@ impl From for TantivyError { } } +impl From for TantivyError { + fn from(io_err: io::Error) -> TantivyError { + TantivyError::IoError(Arc::new(io_err)) + } +} impl From for TantivyError { fn from(data_corruption: DataCorruption) -> TantivyError { TantivyError::DataCorruption(data_corruption) @@ -179,7 +185,7 @@ impl From for TantivyError { impl From for TantivyError { fn from(error: serde_json::Error) -> TantivyError { - TantivyError::IoError(error.into()) + TantivyError::IoError(Arc::new(error.into())) } } From fbc469e5df5b18164d03640054ce83d8b292699f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Jul 2022 11:29:33 +0900 Subject: [PATCH 29/39] Update pprof requirement from 0.9.0 to 0.10.0 (#1400) Updates the requirements on [pprof](https://github.com/tikv/pprof-rs) to permit the latest version. - [Release notes](https://github.com/tikv/pprof-rs/releases) - [Changelog](https://github.com/tikv/pprof-rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/tikv/pprof-rs/compare/v0.9.1...v0.10.0) --- updated-dependencies: - dependency-name: pprof dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4899dd119d..4cb2b6db86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,7 +71,7 @@ proptest = "1.0.0" criterion = "0.3.5" test-log = "0.2.10" env_logger = "0.9.0" -pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] } +pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] } futures = "0.3.21" [dev-dependencies.fail] From d750ced813df955d3c428d7eb3bc7d739a964e19 Mon Sep 17 00:00:00 2001 From: Ryan Russell Date: Sun, 3 Jul 2022 22:12:53 -0500 Subject: [PATCH 30/39] chore(collector): `src/collector` readability (#1399) * chore(collector): `src/collector` readability Signed-off-by: Ryan Russell * Update src/collector/tests.rs --- src/collector/facet_collector.rs | 4 ++-- src/collector/tests.rs | 6 ++---- src/collector/top_collector.rs | 2 +- src/collector/top_score_collector.rs | 4 ++-- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index e2ef47f989..fc514c8164 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -271,8 +271,8 @@ impl Collector for FacetCollector { let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?; if facet_streamer.advance() { 'outer: loop { - // at the begining of this loop, facet_streamer - // is positionned on a term that has not been processed yet. + // at the beginning of this loop, facet_streamer + // is positioned on a term that has not been processed yet. let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it); match skip_result { SkipResult::Found => { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 3bda822a10..ee48cf3014 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -69,10 +69,8 @@ pub fn test_filter_collector() -> crate::Result<()> { /// Stores all of the doc ids. /// This collector is only used for tests. -/// It is unusable in pr -/// -/// actise, as it does not store -/// the segment ordinals +/// It is unusable in practise, as it does +/// not store the segment ordinals pub struct TestCollector { pub compute_score: bool, } diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 03c923d763..691ef324b0 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -137,7 +137,7 @@ where T: PartialOrd + Clone /// sorted by type `T`. /// /// The implementation is based on a `BinaryHeap`. -/// The theorical complexity for collecting the top `K` out of `n` documents +/// The theoretical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. pub(crate) struct TopSegmentCollector { limit: usize, diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 516dedcb58..a0e40cc80f 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -79,7 +79,7 @@ where /// sorted by their score. /// /// The implementation is based on a `BinaryHeap`. -/// The theorical complexity for collecting the top `K` out of `n` documents +/// The theoretical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. 
/// /// This collector guarantees a stable sorting in case of a tie on the @@ -283,7 +283,7 @@ impl TopDocs { /// /// # See also /// - /// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to + /// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to /// [.order_by_fast_field(...)](#method.order_by_fast_field) method. pub fn order_by_u64_field( self, From 1bd44a5f6182f80b9211018ce651ceef656b6aeb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 1 Jul 2022 19:15:17 +0800 Subject: [PATCH 31/39] use total_cmp --- src/aggregation/intermediate_agg_result.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index 97a5955c2c..b6b38bfde0 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -277,11 +277,9 @@ impl IntermediateBucketResult { .collect::>>()?; buckets.sort_by(|left, right| { - // TODO use total_cmp next stable rust release left.from .unwrap_or(f64::MIN) - .partial_cmp(&right.from.unwrap_or(f64::MIN)) - .unwrap_or(Ordering::Equal) + .total_cmp(&right.from.unwrap_or(f64::MIN)) }); Ok(BucketResult::Range { buckets }) } @@ -438,12 +436,9 @@ impl IntermediateTermBucketResult { }) .collect::>>()?; - buckets_with_val.sort_by(|(_, val1), (_, val2)| { - // TODO use total_cmp in next rust stable release - match &order { - Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal), - Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal), - } + buckets_with_val.sort_by(|(_, val1), (_, val2)| match &order { + Order::Desc => val2.total_cmp(val1), + Order::Asc => val1.total_cmp(val2), }); buckets = buckets_with_val .into_iter() From d89a8dd118fadb9a7c051aef876a5e4d57107c15 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 4 Jul 2022 13:15:32 +0800 Subject: [PATCH 32/39] set rust version --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2604f17de1..5e1833dc51 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,7 +28,7 @@ jobs: - name: Install latest nightly to test also against unstable feature flag uses: actions-rs/toolchain@v1 with: - toolchain: stable + toolchain: "1.62" override: true components: rustfmt, clippy From d2784173001b4335c00269cc9b8e3a2d8dcd69eb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 4 Jul 2022 13:22:04 +0800 Subject: [PATCH 33/39] move build step down --- .github/workflows/test.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5e1833dc51..01af351950 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,8 +16,6 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Build - run: cargo build --verbose --workspace - name: Install latest nightly to test also against unstable feature flag uses: actions-rs/toolchain@v1 with: @@ -25,13 +23,16 @@ jobs: override: true components: rustfmt - - name: Install latest nightly to test also against unstable feature flag + - name: Install stable uses: actions-rs/toolchain@v1 with: - toolchain: "1.62" + toolchain: stable override: true components: rustfmt, clippy + - name: Build + run: cargo build --verbose --workspace + - name: Run tests run: cargo +stable test --features 
mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace From 9db2f0e82bcef6b224210f6725cccf39e38a90d6 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 1 Jul 2022 14:06:50 +0800 Subject: [PATCH 34/39] expose doc store cache size expose lru doc store cache size optimize doc store cache size --- src/core/searcher.rs | 20 ++++++++++++++++- src/indexer/merger.rs | 28 +++++++++++++++--------- src/reader/mod.rs | 6 +++++ src/store/mod.rs | 3 ++- src/store/reader.rs | 51 +++++++++++++++++++++++++++++++++++++------ 5 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 1b8f1257e5..35cbd3715d 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -6,7 +6,7 @@ use crate::core::{Executor, SegmentReader}; use crate::query::Query; use crate::schema::{Document, Schema, Term}; use crate::space_usage::SearcherSpaceUsage; -use crate::store::StoreReader; +use crate::store::{CacheStats, StoreReader}; use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject}; /// Identifies the searcher generation accessed by a [Searcher]. @@ -77,11 +77,17 @@ impl Searcher { index: Index, segment_readers: Vec, generation: TrackedObject, + doc_store_cache_size: usize, ) -> io::Result { let store_readers: Vec = segment_readers .iter() .map(SegmentReader::get_store_reader) .collect::>>()?; + + for store_reader in &store_readers { + store_reader.set_cache_size(doc_store_cache_size); + } + Ok(Searcher { schema, index, @@ -110,6 +116,18 @@ impl Searcher { store_reader.get(doc_address.doc_id) } + /// The cache stats for the underlying store reader. + /// + /// Aggregates the sum for each segment store reader. + pub fn doc_store_cache_stats(&self) -> CacheStats { + let cache_stats: CacheStats = self + .store_readers + .iter() + .map(|reader| reader.cache_stats()) + .sum(); + cache_stats + } + /// Fetches a document in an asynchronous manner. 
#[cfg(feature = "quickwit")] pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 3c4f2a2abc..3195815a9d 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1030,18 +1030,25 @@ impl IndexMerger { debug_time!("write-storable-fields"); debug!("write-storable-field"); - let store_readers: Vec<_> = self - .readers - .iter() - .map(|reader| reader.get_store_reader()) - .collect::>()?; - let mut document_iterators: Vec<_> = store_readers - .iter() - .enumerate() - .map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset())) - .collect(); if !doc_id_mapping.is_trivial() { debug!("non-trivial-doc-id-mapping"); + + let store_readers: Vec<_> = self + .readers + .iter() + .map(|reader| reader.get_store_reader()) + .collect::>()?; + + for store_reader in &store_readers { + store_reader.set_cache_size(50); + } + + let mut document_iterators: Vec<_> = store_readers + .iter() + .enumerate() + .map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset())) + .collect(); + for (old_doc_id, reader_ordinal) in doc_id_mapping.iter() { let doc_bytes_it = &mut document_iterators[*reader_ordinal as usize]; if let Some(doc_bytes_res) = doc_bytes_it.next() { @@ -1059,6 +1066,7 @@ impl IndexMerger { debug!("trivial-doc-id-mapping"); for reader in &self.readers { let store_reader = reader.get_store_reader()?; + store_reader.set_cache_size(1); if reader.has_deletes() // If there is not enough data in the store, we avoid stacking in order to // avoid creating many small blocks in the doc store. Once we have 5 full blocks, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index a61172d3bf..7e7d4bbdf9 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -13,6 +13,7 @@ use self::pool::Pool; use self::warming::WarmingState; use crate::core::searcher::SearcherGeneration; use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK}; +use crate::store::LRU_CACHE_CAPACITY; use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject}; /// Defines when a new version of the index should be reloaded. 
@@ -47,6 +48,7 @@ pub struct IndexReaderBuilder { index: Index, warmers: Vec>, num_warming_threads: usize, + doc_store_cache_size: usize, } impl IndexReaderBuilder { @@ -58,6 +60,7 @@ impl IndexReaderBuilder { index, warmers: Vec::new(), num_warming_threads: 1, + doc_store_cache_size: LRU_CACHE_CAPACITY, } } @@ -76,6 +79,7 @@ impl IndexReaderBuilder { let inner_reader = InnerIndexReader { index: self.index, num_searchers: self.num_searchers, + doc_store_cache_size: self.doc_store_cache_size, searcher_pool: Pool::new(), warming_state, searcher_generation_counter: Default::default(), @@ -157,6 +161,7 @@ impl TryInto for IndexReaderBuilder { struct InnerIndexReader { num_searchers: usize, + doc_store_cache_size: usize, index: Index, warming_state: WarmingState, searcher_pool: Pool, @@ -203,6 +208,7 @@ impl InnerIndexReader { self.index.clone(), segment_readers.clone(), searcher_generation.clone(), + self.doc_store_cache_size, ) }) .take(self.num_searchers) diff --git a/src/store/mod.rs b/src/store/mod.rs index 8dd035fe7f..5e9edf10ae 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -40,7 +40,8 @@ mod reader; mod writer; pub use self::compressors::{Compressor, ZstdCompressor}; pub use self::decompressors::Decompressor; -pub use self::reader::StoreReader; +pub(crate) use self::reader::LRU_CACHE_CAPACITY; +pub use self::reader::{CacheStats, StoreReader}; pub use self::writer::StoreWriter; #[cfg(feature = "lz4-compression")] diff --git a/src/store/reader.rs b/src/store/reader.rs index 6d30b7613c..e2b31b2b83 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -1,4 +1,6 @@ use std::io; +use std::iter::Sum; +use std::ops::AddAssign; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; @@ -17,7 +19,7 @@ use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; -const LRU_CACHE_CAPACITY: usize = 100; +pub(crate) const LRU_CACHE_CAPACITY: usize = 100; type Block = OwnedBytes; @@ -30,18 +32,13 @@ pub struct StoreReader { cache: BlockCache, } +/// The cache for decompressed blocks. struct BlockCache { cache: Mutex>, cache_hits: Arc, cache_misses: Arc, } -pub struct CacheStats { - pub num_entries: usize, - pub cache_hits: usize, - pub cache_misses: usize, -} - impl BlockCache { fn get_from_cache(&self, pos: usize) -> Option { if let Some(block) = self.cache.lock().unwrap().get(&pos) { @@ -67,6 +64,10 @@ impl BlockCache { self.cache.lock().unwrap().len() } + fn set_size(&self, size: usize) { + self.cache.lock().unwrap().resize(size); + } + #[cfg(test)] fn peek_lru(&self) -> Option { self.cache @@ -77,6 +78,37 @@ impl BlockCache { } } +#[derive(Debug, Default)] +/// CacheStats for the `StoreReader`. +pub struct CacheStats { + /// The number of entries in the cache + pub num_entries: usize, + /// The number of cache hits. + pub cache_hits: usize, + /// The number of cache misses. 
+ pub cache_misses: usize, +} + +impl AddAssign for CacheStats { + fn add_assign(&mut self, other: Self) { + *self = Self { + num_entries: self.num_entries + other.num_entries, + cache_hits: self.cache_hits + other.cache_hits, + cache_misses: self.cache_misses + other.cache_misses, + }; + } +} + +impl Sum for CacheStats { + fn sum>(mut iter: I) -> Self { + let mut first = iter.next().unwrap_or_default(); + for el in iter { + first += el; + } + first + } +} + impl StoreReader { /// Opens a store reader pub fn open(store_file: FileSlice) -> io::Result { @@ -112,6 +144,11 @@ impl StoreReader { self.cache.stats() } + /// Set lru cache size for decompressed blocks. Defaults to 100 (LRU_CACHE_CAPACITY). + pub(crate) fn set_cache_size(&self, size: usize) { + self.cache.set_size(size) + } + /// Get checkpoint for DocId. The checkpoint can be used to load a block containing the /// document. /// From e31e78f39f0c76356394fbf1feb5b79843524a75 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 4 Jul 2022 14:04:49 +0800 Subject: [PATCH 35/39] fix workflow action --- .github/workflows/long_running.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/long_running.yml b/.github/workflows/long_running.yml index 692c3500b8..17fc753df5 100644 --- a/.github/workflows/long_running.yml +++ b/.github/workflows/long_running.yml @@ -3,22 +3,29 @@ name: Long running tests on: push: branches: [ main ] + pull_request: + branches: [ main ] env: CARGO_TERM_COLOR: always NUM_FUNCTIONAL_TEST_ITERATIONS: 20000 jobs: - functional_test_unsorted: + test: + runs-on: ubuntu-latest + steps: - uses: actions/checkout@v3 + - name: Install stable + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + components: rustfmt, clippy + - name: Run indexing_unsorted run: cargo test indexing_unsorted -- --ignored - functional_test_sorted: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - name: Run indexing_sorted run: cargo test indexing_sorted -- --ignored From 02691f244515289d7318eccdbe1756216ec41f8b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 4 Jul 2022 14:19:32 +0800 Subject: [PATCH 36/39] edition 2021 for subcrates --- bitpacker/Cargo.toml | 2 +- common/Cargo.toml | 2 +- fastfield_codecs/Cargo.toml | 2 +- ownedbytes/Cargo.toml | 2 +- query-grammar/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bitpacker/Cargo.toml b/bitpacker/Cargo.toml index 6f9eb5ae5c..48c1b8a1c0 100644 --- a/bitpacker/Cargo.toml +++ b/bitpacker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tantivy-bitpacker" version = "0.2.0" -edition = "2018" +edition = "2021" authors = ["Paul Masurel "] license = "MIT" categories = [] diff --git a/common/Cargo.toml b/common/Cargo.toml index 6a7df405b8..f7085d9c13 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -3,7 +3,7 @@ name = "tantivy-common" version = "0.3.0" authors = ["Paul Masurel ", "Pascal Seitz "] license = "MIT" -edition = "2018" +edition = "2021" description = "common traits and utility functions used by multiple tantivy subcrates" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index 26d587487d..1a52025ce2 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -3,7 +3,7 @@ name = "fastfield_codecs" version = "0.2.0" authors = ["Pascal Seitz "] license = "MIT" -edition = "2018" +edition = "2021" description = "Fast field codecs 
used by tantivy" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/ownedbytes/Cargo.toml b/ownedbytes/Cargo.toml index 40a8cbf3e3..b06db9f930 100644 --- a/ownedbytes/Cargo.toml +++ b/ownedbytes/Cargo.toml @@ -2,7 +2,7 @@ authors = ["Paul Masurel ", "Pascal Seitz "] name = "ownedbytes" version = "0.3.0" -edition = "2018" +edition = "2021" description = "Expose data as static slice" license = "MIT" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/query-grammar/Cargo.toml b/query-grammar/Cargo.toml index e4f2df98b4..02c967bbcf 100644 --- a/query-grammar/Cargo.toml +++ b/query-grammar/Cargo.toml @@ -9,7 +9,7 @@ homepage = "https://github.com/quickwit-oss/tantivy" repository = "https://github.com/quickwit-oss/tantivy" readme = "README.md" keywords = ["search", "information", "retrieval"] -edition = "2018" +edition = "2021" [dependencies] combine = {version="4", default-features=false, features=[] } From 5750224d4c9d13d863ba7fbe36bae1ddeaeb8038 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 4 Jul 2022 13:54:22 +0800 Subject: [PATCH 37/39] set docstore cache size at construction --- src/core/searcher.rs | 6 +----- src/core/segment_reader.rs | 6 +++--- src/functional_test.rs | 2 +- src/indexer/index_writer.rs | 5 ++++- src/indexer/merger.rs | 9 ++------- src/indexer/segment_writer.rs | 1 + src/reader/mod.rs | 4 ++-- src/store/mod.rs | 16 ++++++++-------- src/store/reader.rs | 17 ++++------------- 9 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 35cbd3715d..8f9bca8104 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -81,13 +81,9 @@ impl Searcher { ) -> io::Result { let store_readers: Vec = segment_readers .iter() - .map(SegmentReader::get_store_reader) + .map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size)) .collect::>>()?; - for store_reader in &store_readers { - store_reader.set_cache_size(doc_store_cache_size); - } - Ok(Searcher { schema, index, diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index dab64d8abd..26e0c41ecd 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -133,8 +133,8 @@ impl SegmentReader { } /// Accessor to the segment's `StoreReader`. - pub fn get_store_reader(&self) -> io::Result { - StoreReader::open(self.store_file.clone()) + pub fn get_store_reader(&self, cache_size: usize) -> io::Result { + StoreReader::open(self.store_file.clone(), cache_size) } /// Open a new segment for reading. 
@@ -326,7 +326,7 @@ impl SegmentReader { self.positions_composite.space_usage(), self.fast_fields_readers.space_usage(), self.fieldnorm_readers.space_usage(), - self.get_store_reader()?.space_usage(), + self.get_store_reader(0)?.space_usage(), self.alive_bitset_opt .as_ref() .map(AliveBitSet::space_usage) diff --git a/src/functional_test.rs b/src/functional_test.rs index e6be8bcc50..e0d0c8bfee 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -9,7 +9,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> { assert!(searcher.segment_readers().len() < 20); assert_eq!(searcher.num_docs() as usize, vals.len()); for segment_reader in searcher.segment_readers() { - let store_reader = segment_reader.get_store_reader()?; + let store_reader = segment_reader.get_store_reader(1)?; for doc_id in 0..segment_reader.max_doc() { let _doc = store_reader.get(doc_id)?; } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 9d5a329893..8718c5370b 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -792,6 +792,7 @@ mod tests { self, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; + use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term}; const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ @@ -1550,7 +1551,9 @@ mod tests { // doc store tests for segment_reader in searcher.segment_readers().iter() { - let store_reader = segment_reader.get_store_reader().unwrap(); + let store_reader = segment_reader + .get_store_reader(DOCSTORE_CACHE_CAPACITY) + .unwrap(); // test store iterator for doc in store_reader.iter(segment_reader.alive_bitset()) { let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 3195815a9d..340ca9127b 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1036,13 +1036,9 @@ impl IndexMerger { let store_readers: Vec<_> = self .readers .iter() - .map(|reader| reader.get_store_reader()) + .map(|reader| reader.get_store_reader(50)) .collect::>()?; - for store_reader in &store_readers { - store_reader.set_cache_size(50); - } - let mut document_iterators: Vec<_> = store_readers .iter() .enumerate() @@ -1065,8 +1061,7 @@ impl IndexMerger { } else { debug!("trivial-doc-id-mapping"); for reader in &self.readers { - let store_reader = reader.get_store_reader()?; - store_reader.set_cache_size(1); + let store_reader = reader.get_store_reader(1)?; if reader.has_deletes() // If there is not enough data in the store, we avoid stacking in order to // avoid creating many small blocks in the doc store. 
Once we have 5 full blocks, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 308cca255e..727eea9537 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -389,6 +389,7 @@ fn remap_and_write( serializer .segment() .open_read(SegmentComponent::TempStore)?, + 50, )?; for old_doc_id in doc_id_map.iter_old_doc_ids() { let doc_bytes = store_read.get_document_bytes(old_doc_id)?; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 7e7d4bbdf9..ca00a9383c 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -13,7 +13,7 @@ use self::pool::Pool; use self::warming::WarmingState; use crate::core::searcher::SearcherGeneration; use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK}; -use crate::store::LRU_CACHE_CAPACITY; +use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject}; /// Defines when a new version of the index should be reloaded. @@ -60,7 +60,7 @@ impl IndexReaderBuilder { index, warmers: Vec::new(), num_warming_threads: 1, - doc_store_cache_size: LRU_CACHE_CAPACITY, + doc_store_cache_size: DOCSTORE_CACHE_CAPACITY, } } diff --git a/src/store/mod.rs b/src/store/mod.rs index 5e9edf10ae..bf57bbc1d0 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -40,7 +40,7 @@ mod reader; mod writer; pub use self::compressors::{Compressor, ZstdCompressor}; pub use self::decompressors::Decompressor; -pub(crate) use self::reader::LRU_CACHE_CAPACITY; +pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY; pub use self::reader::{CacheStats, StoreReader}; pub use self::writer::StoreWriter; @@ -115,7 +115,7 @@ pub mod tests { let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4, BLOCK_SIZE); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file)?; + let store = StoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( *store @@ -155,7 +155,7 @@ pub mod tests { let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file)?; + let store = StoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( *store @@ -232,7 +232,7 @@ pub mod tests { let searcher = index.reader()?.searcher(); let reader = searcher.segment_reader(0); - let store = reader.get_store_reader()?; + let store = reader.get_store_reader(10)?; for doc in store.iter(reader.alive_bitset()) { assert_eq!( *doc?.get_first(text_field).unwrap().as_text().unwrap(), @@ -268,7 +268,7 @@ pub mod tests { } assert_eq!( index.reader().unwrap().searcher().segment_readers()[0] - .get_store_reader() + .get_store_reader(10) .unwrap() .decompressor(), Decompressor::Lz4 @@ -289,7 +289,7 @@ pub mod tests { let searcher = index.reader().unwrap().searcher(); assert_eq!(searcher.segment_readers().len(), 1); let reader = searcher.segment_readers().iter().last().unwrap(); - let store = reader.get_store_reader().unwrap(); + let store = reader.get_store_reader(10).unwrap(); for doc in store.iter(reader.alive_bitset()).take(50) { assert_eq!( @@ -336,7 +336,7 @@ pub mod tests { let searcher = index.reader()?.searcher(); assert_eq!(searcher.segment_readers().len(), 1); let reader = searcher.segment_readers().iter().last().unwrap(); - let store = reader.get_store_reader()?; + let store = reader.get_store_reader(10)?; 
assert_eq!(store.block_checkpoints().count(), 1); Ok(()) } @@ -380,7 +380,7 @@ mod bench { 16_384, ); let store_file = directory.open_read(path).unwrap(); - let store = StoreReader::open(store_file).unwrap(); + let store = StoreReader::open(store_file, 10).unwrap(); b.iter(|| store.iter(None).collect::>()); } } diff --git a/src/store/reader.rs b/src/store/reader.rs index e2b31b2b83..62afd4c04a 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -19,7 +19,7 @@ use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; -pub(crate) const LRU_CACHE_CAPACITY: usize = 100; +pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100; type Block = OwnedBytes; @@ -64,10 +64,6 @@ impl BlockCache { self.cache.lock().unwrap().len() } - fn set_size(&self, size: usize) { - self.cache.lock().unwrap().resize(size); - } - #[cfg(test)] fn peek_lru(&self) -> Option { self.cache @@ -111,7 +107,7 @@ impl Sum for CacheStats { impl StoreReader { /// Opens a store reader - pub fn open(store_file: FileSlice) -> io::Result { + pub fn open(store_file: FileSlice, cache_size: usize) -> io::Result { let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?; let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize); @@ -122,7 +118,7 @@ impl StoreReader { decompressor: footer.decompressor, data: data_file, cache: BlockCache { - cache: Mutex::new(LruCache::new(LRU_CACHE_CAPACITY)), + cache: Mutex::new(LruCache::new(cache_size)), cache_hits: Default::default(), cache_misses: Default::default(), }, @@ -144,11 +140,6 @@ impl StoreReader { self.cache.stats() } - /// Set lru cache size for decompressed blocks. Defaults to 100 (LRU_CACHE_CAPACITY). - pub(crate) fn set_cache_size(&self, size: usize) { - self.cache.set_size(size) - } - /// Get checkpoint for DocId. The checkpoint can be used to load a block containing the /// document. /// @@ -405,7 +396,7 @@ mod tests { let schema = write_lorem_ipsum_store(writer, 500, Compressor::default(), BLOCK_SIZE); let title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; - let store = StoreReader::open(store_file)?; + let store = StoreReader::open(store_file, DOCSTORE_CACHE_CAPACITY)?; assert_eq!(store.cache.len(), 0); assert_eq!(store.cache_stats().cache_hits, 0); From 431b5a091e3853a1ef5360a669c83d491e715173 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 5 Jul 2022 10:32:33 +0800 Subject: [PATCH 38/39] remove test trigger --- .github/workflows/long_running.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/long_running.yml b/.github/workflows/long_running.yml index 17fc753df5..c0de149212 100644 --- a/.github/workflows/long_running.yml +++ b/.github/workflows/long_running.yml @@ -3,8 +3,6 @@ name: Long running tests on: push: branches: [ main ] - pull_request: - branches: [ main ] env: CARGO_TERM_COLOR: always From 2406d9278b17c7b870fe5de7fa2f461b69565290 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 5 Jul 2022 22:40:35 -0700 Subject: [PATCH 39/39] allow set doc store cache size on IndexReaderBuilder (#1407) --- src/reader/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index ca00a9383c..d900f00157 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -124,6 +124,15 @@ impl IndexReaderBuilder { self } + /// Sets the cache size of the doc store readers. + /// + /// The doc store readers cache by default DOCSTORE_CACHE_CAPACITY(100) decompressed blocks. 
+ #[must_use] + pub fn doc_store_cache_size(mut self, doc_store_cache_size: usize) -> IndexReaderBuilder { + self.doc_store_cache_size = doc_store_cache_size; + self + } + /// Sets the number of [Searcher] to pool. /// /// See [IndexReader::searcher()].
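
To illustrate the cache-size plumbing introduced by the last few patches, a minimal usage
sketch follows. It assumes the `doc_store_cache_size()` builder method added in this patch
and the `doc_store_cache_stats()` accessor added in patch 34; both names are taken from the
diffs above rather than from a published tantivy API, so treat the snippet as illustrative
only.

    use tantivy::{Index, IndexReader};

    // Sketch only: relies on the builder method and stats accessor added in this patch series.
    fn open_reader_with_small_cache(index: &Index) -> tantivy::Result<()> {
        // Keep only 10 decompressed doc store blocks per segment in the LRU cache
        // instead of the default DOCSTORE_CACHE_CAPACITY (100).
        let reader: IndexReader = index
            .reader_builder()
            .doc_store_cache_size(10)
            .try_into()?;
        let searcher = reader.searcher();
        // After fetching documents, the searcher reports cache statistics
        // aggregated over all segment store readers.
        let stats = searcher.doc_store_cache_stats();
        println!(
            "doc store cache: {} hits / {} misses",
            stats.cache_hits, stats.cache_misses
        );
        Ok(())
    }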