Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bloom filter config tweaks (#3023) #3175

Merged
merged 2 commits into from Nov 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 4 additions & 12 deletions parquet/src/column/writer/encoder.rs
Expand Up @@ -25,7 +25,6 @@ use crate::data_type::private::ParquetValueType;
use crate::data_type::DataType;
use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
use crate::errors::{ParquetError, Result};
use crate::file::properties::BloomFilterProperties;
use crate::file::properties::{EnabledStatistics, WriterProperties};
use crate::schema::types::{ColumnDescPtr, ColumnDescriptor};
use crate::util::memory::ByteBufferPtr;
Expand Down Expand Up @@ -194,17 +193,10 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {

let statistics_enabled = props.statistics_enabled(descr.path());

let bloom_filter_enabled = props.bloom_filter_enabled(descr.path());
let bloom_filter =
if let Some(BloomFilterProperties { ndv, fpp }) = bloom_filter_enabled {
Sbbf::new_with_ndv_fpp(ndv, fpp)
.map_err(|e| {
eprintln!("invalid bloom filter properties: {}", e);
})
.ok()
} else {
None
};
let bloom_filter = props
.bloom_filter_properties(descr.path())
.map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp))
.transpose()?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to propagate the error than printing it


Ok(Self {
encoder,
Expand Down
86 changes: 42 additions & 44 deletions parquet/src/file/properties.rs
Expand Up @@ -260,16 +260,17 @@ impl WriterProperties {
.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
}

/// Returns whether bloom filter is enabled for a given column. Bloom filter can be enabled over
/// all or for a specific column, and is by default set to be disabled.
pub fn bloom_filter_enabled(
/// Returns the [`BloomFilterProperties`] for the given column
///
/// Returns `None` if bloom filter is disabled
pub fn bloom_filter_properties(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given this returns BloomFilterProperties and not a boolean, I thought this name was more appropriate

&self,
col: &ColumnPath,
) -> Option<BloomFilterProperties> {
) -> Option<&BloomFilterProperties> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally it is better to return a reference so that callers can clone only if necessary

self.column_properties
.get(col)
.and_then(|c| c.bloom_filter_enabled())
.or_else(|| self.default_column_properties.bloom_filter_enabled())
.and_then(|c| c.bloom_filter_properties())
.or_else(|| self.default_column_properties.bloom_filter_properties())
}
}

Expand Down Expand Up @@ -628,7 +629,7 @@ struct ColumnProperties {
statistics_enabled: Option<EnabledStatistics>,
max_statistics_size: Option<usize>,
/// bloom filter related properties
bloom_filter_enabled: Option<BloomFilterProperties>,
bloom_filter_properies: Option<BloomFilterProperties>,
}

impl ColumnProperties {
Expand Down Expand Up @@ -672,40 +673,37 @@ impl ColumnProperties {
/// otherwise it is a no-op.
/// If `value` is `false`, resets bloom filter properties to `None`.
fn set_bloom_filter_enabled(&mut self, value: bool) {
if value {
self.bloom_filter_enabled = self
.bloom_filter_enabled()
.or_else(|| Some(Default::default()));
} else {
self.bloom_filter_enabled = None;
if value && self.bloom_filter_properies.is_none() {
self.bloom_filter_properies = Some(Default::default())
} else if !value {
self.bloom_filter_properies = None
}
}

/// Sets the false positive probability for bloom filter for this column, and implicitly enables
/// bloom filter if not previously enabled. If the `value` is not between 0 and 1 exclusive, it is
/// discarded as no-op.
/// bloom filter if not previously enabled.
///
/// # Panics
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better IMO to fail loud than silently

///
/// Panics if the `value` is not between 0 and 1 exclusive
fn set_bloom_filter_fpp(&mut self, value: f64) {
if (0.0..1.0).contains(&value) {
self.bloom_filter_enabled = self
.bloom_filter_enabled()
.or_else(|| Some(Default::default()))
.map(|BloomFilterProperties { ndv, .. }| BloomFilterProperties {
ndv,
fpp: value,
});
}
assert!(
value > 0. && value < 1.0,
"fpp must be between 0 and 1 exclusive, got {}",
value
);

self.bloom_filter_properies
.get_or_insert_with(Default::default)
.fpp = value;
}

/// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
/// enables bloom filter if not previously enabled.
fn set_bloom_filter_ndv(&mut self, value: u64) {
self.bloom_filter_enabled = self
.bloom_filter_enabled()
.or_else(|| Some(Default::default()))
.map(|BloomFilterProperties { fpp, .. }| BloomFilterProperties {
ndv: value,
fpp,
});
self.bloom_filter_properies
.get_or_insert_with(Default::default)
.ndv = value;
}

/// Returns optional encoding for this column.
Expand Down Expand Up @@ -736,9 +734,9 @@ impl ColumnProperties {
self.max_statistics_size
}

/// Returns bloom filter properties if set.
fn bloom_filter_enabled(&self) -> Option<BloomFilterProperties> {
self.bloom_filter_enabled.clone()
/// Returns the bloom filter properties, or `None` if not enabled
fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
self.bloom_filter_properies.as_ref()
}
}

Expand Down Expand Up @@ -867,7 +865,7 @@ mod tests {
DEFAULT_MAX_STATISTICS_SIZE
);
assert!(props
.bloom_filter_enabled(&ColumnPath::from("col"))
.bloom_filter_properties(&ColumnPath::from("col"))
.is_none());
}

Expand Down Expand Up @@ -997,8 +995,8 @@ mod tests {
);
assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123);
assert_eq!(
props.bloom_filter_enabled(&ColumnPath::from("col")),
Some(BloomFilterProperties { fpp: 0.1, ndv: 100 })
props.bloom_filter_properties(&ColumnPath::from("col")),
Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
);
}

Expand All @@ -1024,8 +1022,8 @@ mod tests {
DEFAULT_DICTIONARY_ENABLED
);
assert_eq!(
props.bloom_filter_enabled(&ColumnPath::from("col")),
Some(BloomFilterProperties {
props.bloom_filter_properties(&ColumnPath::from("col")),
Some(&BloomFilterProperties {
fpp: 0.05,
ndv: 1_000_000_u64
})
Expand All @@ -1037,15 +1035,15 @@ mod tests {
assert_eq!(
WriterProperties::builder()
.build()
.bloom_filter_enabled(&ColumnPath::from("col")),
.bloom_filter_properties(&ColumnPath::from("col")),
None
);
assert_eq!(
WriterProperties::builder()
.set_bloom_filter_ndv(100)
.build()
.bloom_filter_enabled(&ColumnPath::from("col")),
Some(BloomFilterProperties {
.bloom_filter_properties(&ColumnPath::from("col")),
Some(&BloomFilterProperties {
fpp: 0.05,
ndv: 100
})
Expand All @@ -1054,8 +1052,8 @@ mod tests {
WriterProperties::builder()
.set_bloom_filter_fpp(0.1)
.build()
.bloom_filter_enabled(&ColumnPath::from("col")),
Some(BloomFilterProperties {
.bloom_filter_properties(&ColumnPath::from("col")),
Some(&BloomFilterProperties {
fpp: 0.1,
ndv: 1_000_000_u64
})
Expand Down