Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

add bytes_estimate for binary push in parquet deserialize #1308

Merged
merged 8 commits on Dec 12, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/io/parquet/read/deserialize/binary/basic.rs
Expand Up @@ -480,9 +480,13 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder<O> {

pub(super) fn finish<O: Offset, A: TraitBinaryArray<O>>(
data_type: &DataType,
values: Binary<O>,
validity: MutableBitmap,
mut values: Binary<O>,
mut validity: MutableBitmap,
) -> Result<A> {
values.offsets.0.shrink_to_fit();
sundy-li marked this conversation as resolved.
Show resolved Hide resolved
values.values.shrink_to_fit();
validity.shrink_to_fit();

A::try_new(
data_type.clone(),
values.offsets.0.into(),
Expand Down
10 changes: 9 additions & 1 deletion src/io/parquet/read/deserialize/binary/utils.rs
Expand Up @@ -56,13 +56,21 @@ impl<O: Offset> Binary<O> {
offsets.push(O::default());
Self {
offsets: Offsets(offsets),
values: Vec::with_capacity(capacity * 24),
values: Vec::with_capacity(capacity.min(100) * 24),
last_offset: O::default(),
}
}

#[inline]
pub fn push(&mut self, v: &[u8]) {
if self.offsets.0.len() == 101 && self.offsets.0.capacity() > 101 {
let bytes_per_row = self.values.len() / 100 + 1;
sundy-li marked this conversation as resolved.
Show resolved Hide resolved
let bytes_estimate = bytes_per_row * self.offsets.0.capacity();
if bytes_estimate > self.values.capacity() {
self.values.reserve(bytes_estimate - self.values.capacity());
}
}

self.values.extend(v);
self.last_offset += O::from_usize(v.len()).unwrap();
self.offsets.push(self.last_offset)
Expand Down