Fix max bucket limit in histogram (#1703)
* Fix max bucket limit in histogram

The max bucket limit in the histogram aggregation was broken, since some code introduced temporary filtering of buckets, which then resulted in an incorrect increment of the bucket count.
The provided solution covers more scenarios, but some scenarios remain unhandled (see #1702).
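For illustration, a minimal sketch of the estimation idea behind the fix, assuming a simplified `Bucket` struct and a hypothetical `MAX_BUCKET_COUNT` constant (the actual code reports the estimate through `bucket_count.add_count` and `validate_bucket_count`):

    struct Bucket {
        doc_count: u64,
    }

    // Hypothetical limit for this sketch; the real limit is tracked by the
    // aggregation's bucket_count accessor.
    const MAX_BUCKET_COUNT: usize = 65_000;

    /// Returns a lower-bound estimate of the buckets that survive the `min_doc_count`
    /// cut-off. Buckets at the front and back whose doc_count does not exceed
    /// `min_doc_count` are excluded, since they may be dropped in the final result;
    /// after merging segments they could still end up above `min_doc_count`, which is
    /// why this is only a lower bound.
    fn estimate_and_validate(buckets: &[Bucket], min_doc_count: u64) -> Result<usize, String> {
        let cut_off_front = buckets
            .iter()
            .take_while(|bucket| bucket.doc_count <= min_doc_count)
            .count();
        let cut_off_back = buckets[cut_off_front..]
            .iter()
            .rev()
            .take_while(|bucket| bucket.doc_count <= min_doc_count)
            .count();
        let estimate = buckets.len() - cut_off_front - cut_off_back;
        if estimate > MAX_BUCKET_COUNT {
            return Err(
                "Aborting aggregation because too many buckets were created".to_string(),
            );
        }
        Ok(estimate)
    }

In the commit, this check runs at the start of the conversion to the intermediate bucket result, on the collector's raw buckets, instead of after the final buckets vector has been built.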

* Apply suggestions from code review

Co-authored-by: Paul Masurel <paul@quickwit.io>

Co-authored-by: Paul Masurel <paul@quickwit.io>
PSeitz and fulmicoton committed Dec 12, 2022
1 parent 509adab commit 2c50b02
Showing 1 changed file with 58 additions and 5 deletions.
63 changes: 58 additions & 5 deletions src/aggregation/bucket/histogram/histogram.rs
@@ -206,6 +206,7 @@ pub struct SegmentHistogramCollector {
field_type: Type,
interval: f64,
offset: f64,
min_doc_count: u64,
first_bucket_num: i64,
bounds: HistogramBounds,
}
@@ -215,6 +216,30 @@ impl SegmentHistogramCollector {
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
// Compute the number of buckets to validate against the maximum number of buckets.
// Note: We use min_doc_count here, but it is only a lower bound at this point, since we are
// on the intermediate level and, after merging, the number of documents in a bucket could
// exceed `min_doc_count`.
{
let cut_off_buckets_front = self
.buckets
.iter()
.take_while(|bucket| bucket.doc_count <= self.min_doc_count)
.count();
let cut_off_buckets_back = self.buckets[cut_off_buckets_front..]
.iter()
.rev()
.take_while(|bucket| bucket.doc_count <= self.min_doc_count)
.count();
let estimate_num_buckets =
self.buckets.len() - cut_off_buckets_front - cut_off_buckets_back;

agg_with_accessor
.bucket_count
.add_count(estimate_num_buckets as u32);
agg_with_accessor.bucket_count.validate_bucket_count()?;
}

let mut buckets = Vec::with_capacity(
self.buckets
.iter()
@@ -251,11 +276,6 @@ impl SegmentHistogramCollector {
);
};

agg_with_accessor
.bucket_count
.add_count(buckets.len() as u32);
agg_with_accessor.bucket_count.validate_bucket_count()?;

Ok(IntermediateBucketResult::Histogram { buckets })
}

@@ -308,6 +328,7 @@ impl SegmentHistogramCollector {
first_bucket_num,
bounds,
sub_aggregations,
min_doc_count: req.min_doc_count(),
})
}

@@ -1521,4 +1542,36 @@ mod tests {

Ok(())
}

#[test]
fn histogram_test_max_buckets_segments() -> crate::Result<()> {
let values = vec![0.0, 70000.0];

let index = get_test_index_from_values(true, &values)?;

let agg_req: Aggregations = vec![(
"my_interval".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 1.0,
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();

let res = exec_request(agg_req, &index);

assert_eq!(
res.unwrap_err().to_string(),
"An invalid argument was passed: 'Aborting aggregation because too many buckets were \
created'"
.to_string()
);

Ok(())
}
}
