From 3354e33fa17199e7e793ce5c0b29c0db9d3f8053 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:14:38 +0800 Subject: [PATCH 01/21] create null buffer builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/mod.rs | 2 + .../src/array/builder/null_buffer_builder.rs | 73 +++++++++++++++++++ arrow/src/array/builder/primitive_builder.rs | 62 +++------------- 3 files changed, 87 insertions(+), 50 deletions(-) create mode 100644 arrow/src/array/builder/null_buffer_builder.rs diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs index a5b7f1ed05b..d2d4ff85922 100644 --- a/arrow/src/array/builder/mod.rs +++ b/arrow/src/array/builder/mod.rs @@ -30,6 +30,7 @@ mod generic_binary_builder; mod generic_list_builder; mod generic_string_builder; mod map_builder; +mod null_buffer_builder; mod primitive_builder; mod primitive_dictionary_builder; mod string_dictionary_builder; @@ -53,6 +54,7 @@ pub use generic_binary_builder::GenericBinaryBuilder; pub use generic_list_builder::GenericListBuilder; pub use generic_string_builder::GenericStringBuilder; pub use map_builder::MapBuilder; +pub use null_buffer_builder::NullBufferBuilder; pub use primitive_builder::PrimitiveBuilder; pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder; pub use string_dictionary_builder::StringDictionaryBuilder; diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs new file mode 100644 index 00000000000..7286b366d46 --- /dev/null +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::buffer::Buffer; + +use super::BooleanBufferBuilder; + +#[derive(Debug)] +pub struct NullBufferBuilder { + bitmap_builder: Option, + len: usize, +} + +impl NullBufferBuilder { + pub fn new() -> Self { + Self { + bitmap_builder: None, + len: 0 + } + } + + pub fn append_n_true(&mut self, n: usize) { + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append_n(n, true) + } + self.len += n; + } + + pub fn append_n_false(&mut self, n: usize) { + self.materialize(); + self.bitmap_builder.as_mut().unwrap().append_n(n, false); + self.len += n; + } + + pub fn append_slice(&mut self, slice: &[bool]) { + if slice.iter().any(|v| !v) { + self.materialize() + } + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append_slice(slice) + } + self.len += slice.len(); + } + + pub fn finish(&mut self) -> Option { + let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); + self.bitmap_builder = None; + self.len = 0; + buf + } + + fn materialize(&mut self) { + if self.bitmap_builder.is_none() { + let mut b = BooleanBufferBuilder::new(self.len); + b.append_n(self.len, true); + self.bitmap_builder = Some(b); + } + } +} \ No newline at end of file diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index 36f4ae2b7d9..07e799e0aab 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -23,7 +23,7 @@ use crate::array::ArrayRef; use crate::array::PrimitiveArray; use crate::datatypes::ArrowPrimitiveType; -use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; +use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; /// Array builder for fixed-width primitive types #[derive(Debug)] @@ -31,7 +31,7 @@ pub struct PrimitiveBuilder { values_builder: BufferBuilder, /// We only materialize the builder when we add `false`. /// This optimization is **very** important for performance of `StringBuilder`. - bitmap_builder: Option, + null_buffer_builder: NullBufferBuilder, } impl ArrayBuilder for PrimitiveBuilder { @@ -71,7 +71,7 @@ impl PrimitiveBuilder { pub fn new(capacity: usize) -> Self { Self { values_builder: BufferBuilder::::new(capacity), - bitmap_builder: None, + null_buffer_builder: NullBufferBuilder::new(), } } @@ -83,24 +83,20 @@ impl PrimitiveBuilder { /// Appends a value of type `T` into the builder #[inline] pub fn append_value(&mut self, v: T::Native) { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append(true); - } + self.null_buffer_builder.append_n_true(1); self.values_builder.append(v); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.materialize_bitmap_builder_if_needed(); - self.bitmap_builder.as_mut().unwrap().append(false); + self.null_buffer_builder.append_n_false(1); self.values_builder.advance(1); } #[inline] pub fn append_nulls(&mut self, n: usize) { - self.materialize_bitmap_builder_if_needed(); - self.bitmap_builder.as_mut().unwrap().append_n(n, false); + self.null_buffer_builder.append_n_false(n); self.values_builder.advance(n); } @@ -116,9 +112,7 @@ impl PrimitiveBuilder { /// Appends a slice of type `T` into the builder #[inline] pub fn append_slice(&mut self, v: &[T::Native]) { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(v.len(), true); - } + self.null_buffer_builder.append_n_true(v.len()); self.values_builder.append_slice(v); } @@ -130,12 +124,7 @@ impl PrimitiveBuilder { is_valid.len(), "Value and validity lengths must be equal" ); - if is_valid.iter().any(|v| !*v) { - self.materialize_bitmap_builder_if_needed(); - } - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_slice(is_valid); - } + self.null_buffer_builder.append_slice(is_valid); self.values_builder.append_slice(values); } @@ -155,50 +144,23 @@ impl PrimitiveBuilder { .1 .expect("append_trusted_len_iter requires an upper bound"); - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(len, true); - } + self.null_buffer_builder.append_n_true(len); self.values_builder.append_trusted_len_iter(iter); } - /// Builds the `PrimitiveArray` and reset this builder. + /// Builds the [`PrimitiveArray`] and reset this builder. pub fn finish(&mut self) -> PrimitiveArray { let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); + let null_bit_buffer = self.null_buffer_builder.finish(); let builder = ArrayData::builder(T::DATA_TYPE) .len(len) .add_buffer(self.values_builder.finish()) - .null_bit_buffer(if null_count > 0 { - null_bit_buffer - } else { - None - }); + .null_bit_buffer(null_bit_buffer); let array_data = unsafe { builder.build_unchecked() }; PrimitiveArray::::from(array_data) } - #[inline] - fn materialize_bitmap_builder_if_needed(&mut self) { - if self.bitmap_builder.is_some() { - return; - } - self.materialize_bitmap_builder() - } - - #[cold] - fn materialize_bitmap_builder(&mut self) { - let mut b = BooleanBufferBuilder::new(0); - b.reserve(self.values_builder.capacity()); - b.append_n(self.values_builder.len(), true); - self.bitmap_builder = Some(b); - } - /// Returns the current values buffer as a slice pub fn values_slice(&self) -> &[T::Native] { self.values_builder.as_slice() From 118417d4019548a3407a415c787ab09a546ddde5 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:30:27 +0800 Subject: [PATCH 02/21] use null buffer builder in boolean array builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 42 +++++++------------- arrow/src/array/builder/primitive_builder.rs | 3 +- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index 1e052b644c8..a8ef862d5ad 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -28,6 +28,7 @@ use crate::error::ArrowError; use crate::error::Result; use super::BooleanBufferBuilder; +use super::NullBufferBuilder; /// Array builder for fixed-width primitive types /// @@ -64,7 +65,7 @@ pub struct BooleanBuilder { values_builder: BooleanBufferBuilder, /// We only materialize the builder when we add `false`. /// This optimization is **very** important for the performance. - bitmap_builder: Option, + null_buffer_builder: NullBufferBuilder, } impl BooleanBuilder { @@ -72,7 +73,7 @@ impl BooleanBuilder { pub fn new(capacity: usize) -> Self { Self { values_builder: BooleanBufferBuilder::new(capacity), - bitmap_builder: None, + null_buffer_builder: NullBufferBuilder::new(), } } @@ -85,17 +86,19 @@ impl BooleanBuilder { #[inline] pub fn append_value(&mut self, v: bool) { self.values_builder.append(v); - if let Some(b) = self.bitmap_builder.as_mut() { - b.append(true) - } + self.null_buffer_builder.append_n_true(1); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.materialize_bitmap_builder(); - self.bitmap_builder.as_mut().unwrap().append(false); - self.values_builder.advance(1); + self.append_nulls(1) + } + + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.null_buffer_builder.append_n_false(n); + self.values_builder.advance(n); } /// Appends an `Option` into the builder @@ -110,9 +113,7 @@ impl BooleanBuilder { /// Appends a slice of type `T` into the builder #[inline] pub fn append_slice(&mut self, v: &[bool]) { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(v.len(), true) - } + self.null_buffer_builder.append_n_true(v.len()); self.values_builder.append_slice(v); } @@ -126,13 +127,7 @@ impl BooleanBuilder { "Value and validity lengths must be equal".to_string(), )) } else { - is_valid - .iter() - .any(|v| !*v) - .then(|| self.materialize_bitmap_builder()); - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_slice(is_valid) - } + self.null_buffer_builder.append_slice(is_valid); self.values_builder.append_slice(values); Ok(()) } @@ -141,7 +136,7 @@ impl BooleanBuilder { /// Builds the [BooleanArray] and reset this builder. pub fn finish(&mut self) -> BooleanArray { let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); + let null_bit_buffer = self.null_buffer_builder.finish(); let builder = ArrayData::builder(DataType::Boolean) .len(len) .add_buffer(self.values_builder.finish()) @@ -150,15 +145,6 @@ impl BooleanBuilder { let array_data = unsafe { builder.build_unchecked() }; BooleanArray::from(array_data) } - - fn materialize_bitmap_builder(&mut self) { - if self.bitmap_builder.is_none() { - let mut b = BooleanBufferBuilder::new(0); - b.reserve(self.values_builder.capacity()); - b.append_n(self.values_builder.len(), true); - self.bitmap_builder = Some(b); - } - } } impl ArrayBuilder for BooleanBuilder { diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index 07e799e0aab..5263e5b3082 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -90,8 +90,7 @@ impl PrimitiveBuilder { /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.null_buffer_builder.append_n_false(1); - self.values_builder.advance(1); + self.append_nulls(1) } #[inline] From 6f13da67ecd38a18ba7b8a330ba4d0cfa50a2072 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:40:16 +0800 Subject: [PATCH 03/21] add capacity field and update docs Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 4 +--- arrow/src/array/builder/null_buffer_builder.rs | 10 +++++++--- arrow/src/array/builder/primitive_builder.rs | 4 +--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index a8ef862d5ad..64ffb2b442b 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -63,8 +63,6 @@ use super::NullBufferBuilder; #[derive(Debug)] pub struct BooleanBuilder { values_builder: BooleanBufferBuilder, - /// We only materialize the builder when we add `false`. - /// This optimization is **very** important for the performance. null_buffer_builder: NullBufferBuilder, } @@ -73,7 +71,7 @@ impl BooleanBuilder { pub fn new(capacity: usize) -> Self { Self { values_builder: BooleanBufferBuilder::new(capacity), - null_buffer_builder: NullBufferBuilder::new(), + null_buffer_builder: NullBufferBuilder::new(capacity), } } diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 7286b366d46..e8ec4c09dbf 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -19,17 +19,21 @@ use crate::buffer::Buffer; use super::BooleanBufferBuilder; +/// We only materialize the builder when we add `false`. +/// This optimization is **very** important for the performance. #[derive(Debug)] pub struct NullBufferBuilder { bitmap_builder: Option, len: usize, + capacity: usize, } impl NullBufferBuilder { - pub fn new() -> Self { + pub fn new(capacity: usize) -> Self { Self { bitmap_builder: None, - len: 0 + len: 0, + capacity } } @@ -65,7 +69,7 @@ impl NullBufferBuilder { fn materialize(&mut self) { if self.bitmap_builder.is_none() { - let mut b = BooleanBufferBuilder::new(self.len); + let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); b.append_n(self.len, true); self.bitmap_builder = Some(b); } diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index 5263e5b3082..5d5d919722d 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -29,8 +29,6 @@ use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; #[derive(Debug)] pub struct PrimitiveBuilder { values_builder: BufferBuilder, - /// We only materialize the builder when we add `false`. - /// This optimization is **very** important for performance of `StringBuilder`. null_buffer_builder: NullBufferBuilder, } @@ -71,7 +69,7 @@ impl PrimitiveBuilder { pub fn new(capacity: usize) -> Self { Self { values_builder: BufferBuilder::::new(capacity), - null_buffer_builder: NullBufferBuilder::new(), + null_buffer_builder: NullBufferBuilder::new(capacity), } } From 6b83eb1bc18e9de3446322a4cb458f0abb143b78 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:48:45 +0800 Subject: [PATCH 04/21] update fixed size binary builder Signed-off-by: remzi <13716567376yh@gmail.com> --- .../builder/fixed_size_binary_builder.rs | 16 +++++++-------- .../src/array/builder/null_buffer_builder.rs | 20 +++++++++++++++++++ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow/src/array/builder/fixed_size_binary_builder.rs index 5ef89d8f460..5e35c9d5eef 100644 --- a/arrow/src/array/builder/fixed_size_binary_builder.rs +++ b/arrow/src/array/builder/fixed_size_binary_builder.rs @@ -23,12 +23,12 @@ use crate::error::{ArrowError, Result}; use std::any::Any; use std::sync::Arc; -use super::BooleanBufferBuilder; +use super::{NullBufferBuilder}; #[derive(Debug)] pub struct FixedSizeBinaryBuilder { values_builder: UInt8BufferBuilder, - bitmap_builder: BooleanBufferBuilder, + null_buffer_builder: NullBufferBuilder, value_length: i32, } @@ -43,7 +43,7 @@ impl FixedSizeBinaryBuilder { ); Self { values_builder: UInt8BufferBuilder::new(capacity), - bitmap_builder: BooleanBufferBuilder::new(if byte_width > 0 { + null_buffer_builder: NullBufferBuilder::new(if byte_width > 0 { capacity / byte_width as usize } else { 0 @@ -64,7 +64,7 @@ impl FixedSizeBinaryBuilder { )) } else { self.values_builder.append_slice(value.as_ref()); - self.bitmap_builder.append(true); + self.null_buffer_builder.append_true(); Ok(()) } } @@ -74,7 +74,7 @@ impl FixedSizeBinaryBuilder { pub fn append_null(&mut self) { self.values_builder .append_slice(&vec![0u8; self.value_length as usize][..]); - self.bitmap_builder.append(false); + self.null_buffer_builder.append_false(); } /// Builds the [`FixedSizeBinaryArray`] and reset this builder. @@ -83,7 +83,7 @@ impl FixedSizeBinaryBuilder { let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) .add_buffer(self.values_builder.finish()) - .null_bit_buffer(Some(self.bitmap_builder.finish())) + .null_bit_buffer(self.null_buffer_builder.finish()) .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) @@ -108,12 +108,12 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { /// Returns the number of array slots in the builder fn len(&self) -> usize { - self.bitmap_builder.len() + self.null_buffer_builder.len() } /// Returns whether the number of array slots is zero fn is_empty(&self) -> bool { - self.bitmap_builder.is_empty() + self.null_buffer_builder.is_empty() } /// Builds the array and reset this builder. diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index e8ec4c09dbf..b291866763a 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -44,12 +44,22 @@ impl NullBufferBuilder { self.len += n; } + #[inline] + pub fn append_true(&mut self) { + self.append_n_true(1); + } + pub fn append_n_false(&mut self, n: usize) { self.materialize(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); self.len += n; } + #[inline] + pub fn append_false(&mut self) { + self.append_n_false(1); + } + pub fn append_slice(&mut self, slice: &[bool]) { if slice.iter().any(|v| !v) { self.materialize() @@ -74,4 +84,14 @@ impl NullBufferBuilder { self.bitmap_builder = Some(b); } } +} + +impl NullBufferBuilder { + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } } \ No newline at end of file From 22b6a33e4b9f618b4a088149fa9b70c0ee4fcb3b Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:55:27 +0800 Subject: [PATCH 05/21] update fixed size list builder Signed-off-by: remzi <13716567376yh@gmail.com> --- .../array/builder/fixed_size_binary_builder.rs | 2 +- .../src/array/builder/fixed_size_list_builder.rs | 16 ++++++++-------- arrow/src/array/builder/null_buffer_builder.rs | 8 ++++++++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow/src/array/builder/fixed_size_binary_builder.rs index 5e35c9d5eef..a03e6dcc35e 100644 --- a/arrow/src/array/builder/fixed_size_binary_builder.rs +++ b/arrow/src/array/builder/fixed_size_binary_builder.rs @@ -23,7 +23,7 @@ use crate::error::{ArrowError, Result}; use std::any::Any; use std::sync::Arc; -use super::{NullBufferBuilder}; +use super::NullBufferBuilder; #[derive(Debug)] pub struct FixedSizeBinaryBuilder { diff --git a/arrow/src/array/builder/fixed_size_list_builder.rs b/arrow/src/array/builder/fixed_size_list_builder.rs index 4e80e585a94..343ce3657b3 100644 --- a/arrow/src/array/builder/fixed_size_list_builder.rs +++ b/arrow/src/array/builder/fixed_size_list_builder.rs @@ -25,12 +25,12 @@ use crate::datatypes::DataType; use crate::datatypes::Field; use super::ArrayBuilder; -use super::BooleanBufferBuilder; +use super::NullBufferBuilder; /// Array builder for [`FixedSizeListArray`] #[derive(Debug)] pub struct FixedSizeListBuilder { - bitmap_builder: BooleanBufferBuilder, + null_buffer_builder: NullBufferBuilder, values_builder: T, list_len: i32, } @@ -48,7 +48,7 @@ impl FixedSizeListBuilder { /// `capacity` is the number of items to pre-allocate space for in this builder pub fn with_capacity(values_builder: T, value_length: i32, capacity: usize) -> Self { Self { - bitmap_builder: BooleanBufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), values_builder, list_len: value_length, } @@ -76,12 +76,12 @@ where /// Returns the number of array slots in the builder fn len(&self) -> usize { - self.bitmap_builder.len() + self.null_buffer_builder.len() } /// Returns whether the number of array slots is zero fn is_empty(&self) -> bool { - self.bitmap_builder.is_empty() + self.null_buffer_builder.is_empty() } /// Builds the array and reset this builder. @@ -109,7 +109,7 @@ where /// Finish the current fixed-length list array slot #[inline] pub fn append(&mut self, is_valid: bool) { - self.bitmap_builder.append(is_valid); + self.null_buffer_builder.append(is_valid); } /// Builds the [`FixedSizeListBuilder`] and reset this builder. @@ -131,14 +131,14 @@ where len, ); - let null_bit_buffer = self.bitmap_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); let array_data = ArrayData::builder(DataType::FixedSizeList( Box::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) .add_child_data(values_data.clone()) - .null_bit_buffer(Some(null_bit_buffer)); + .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index b291866763a..7765c66f651 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -60,6 +60,14 @@ impl NullBufferBuilder { self.append_n_false(1); } + pub fn append(&mut self, v: bool) { + if v { + self.append_true() + } else { + self.append_false() + } + } + pub fn append_slice(&mut self, slice: &[bool]) { if slice.iter().any(|v| !v) { self.materialize() From 5c7844112c4329bda12396052ef52a6dd315a97d Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 12:58:40 +0800 Subject: [PATCH 06/21] update binary builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/generic_binary_builder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs index 54c1855a1b7..1e29c470cac 100644 --- a/arrow/src/array/builder/generic_binary_builder.rs +++ b/arrow/src/array/builder/generic_binary_builder.rs @@ -25,14 +25,14 @@ use crate::{ use std::any::Any; use std::sync::Arc; -use super::{BooleanBufferBuilder, BufferBuilder}; +use super::{BufferBuilder, NullBufferBuilder}; /// Array builder for [`GenericBinaryArray`] #[derive(Debug)] pub struct GenericBinaryBuilder { value_builder: UInt8BufferBuilder, offsets_builder: BufferBuilder, - null_buffer_builder: BooleanBufferBuilder, + null_buffer_builder: NullBufferBuilder, } impl GenericBinaryBuilder { @@ -44,7 +44,7 @@ impl GenericBinaryBuilder { Self { value_builder: UInt8BufferBuilder::new(capacity), offsets_builder, - null_buffer_builder: BooleanBufferBuilder::new(1024), + null_buffer_builder: NullBufferBuilder::new(1024), } } @@ -76,7 +76,7 @@ impl GenericBinaryBuilder { .len(self.len()) .add_buffer(self.offsets_builder.finish()) .add_buffer(self.value_builder.finish()) - .null_bit_buffer(Some(self.null_buffer_builder.finish())); + .null_bit_buffer(self.null_buffer_builder.finish()); self.offsets_builder.append(OffsetSize::zero()); let array_data = unsafe { array_builder.build_unchecked() }; From 98cd443728b1bf1ea789bd41f31e8ec6c93ffe58 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 13:01:26 +0800 Subject: [PATCH 07/21] update list builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/generic_list_builder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs index e478cc7a3b1..1e5151ef719 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -25,13 +25,13 @@ use crate::array::OffsetSizeTrait; use crate::datatypes::DataType; use crate::datatypes::Field; -use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; +use super::{ArrayBuilder, NullBufferBuilder, BufferBuilder}; /// Array builder for [`GenericListArray`] #[derive(Debug)] pub struct GenericListBuilder { offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, + bitmap_builder: NullBufferBuilder, values_builder: T, } @@ -49,7 +49,7 @@ impl GenericListBuilder Date: Fri, 22 Jul 2022 13:04:10 +0800 Subject: [PATCH 08/21] update map builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/map_builder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow/src/array/builder/map_builder.rs b/arrow/src/array/builder/map_builder.rs index 6078f8f2251..29d4a9d465c 100644 --- a/arrow/src/array/builder/map_builder.rs +++ b/arrow/src/array/builder/map_builder.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; +use super::{ArrayBuilder, NullBufferBuilder, BufferBuilder}; use crate::array::array::Array; use crate::array::ArrayData; use crate::array::ArrayRef; @@ -32,7 +32,7 @@ use crate::error::Result; #[derive(Debug)] pub struct MapBuilder { offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, + bitmap_builder: NullBufferBuilder, field_names: MapFieldNames, key_builder: K, value_builder: V, @@ -78,7 +78,7 @@ impl MapBuilder { offsets_builder.append(len); Self { offsets_builder, - bitmap_builder: BooleanBufferBuilder::new(capacity), + bitmap_builder: NullBufferBuilder::new(capacity), field_names: field_names.unwrap_or_default(), key_builder, value_builder, @@ -156,7 +156,7 @@ impl MapBuilder { .len(len) .add_buffer(offset_buffer) .add_child_data(struct_array.into_data()) - .null_bit_buffer(Some(null_bit_buffer)); + .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; From caa4c131b803d4bcd23f06384cd3933bcba7a726 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 13:08:35 +0800 Subject: [PATCH 09/21] update struct builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/struct_builder.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow/src/array/builder/struct_builder.rs index c679dd6217e..0121526982d 100644 --- a/arrow/src/array/builder/struct_builder.rs +++ b/arrow/src/array/builder/struct_builder.rs @@ -24,6 +24,8 @@ use crate::array::*; use crate::datatypes::DataType; use crate::datatypes::Field; +use super::NullBufferBuilder; + /// Array builder for Struct types. /// /// Note that callers should make sure that methods of all the child field builders are @@ -31,7 +33,7 @@ use crate::datatypes::Field; pub struct StructBuilder { fields: Vec, field_builders: Vec>, - bitmap_builder: BooleanBufferBuilder, + bitmap_builder: NullBufferBuilder, len: usize, } @@ -173,7 +175,7 @@ impl StructBuilder { Self { fields, field_builders, - bitmap_builder: BooleanBufferBuilder::new(0), + bitmap_builder: NullBufferBuilder::new(0), len: 0, } } @@ -221,13 +223,10 @@ impl StructBuilder { } let null_bit_buffer = self.bitmap_builder.finish(); - let null_count = self.len - null_bit_buffer.count_set_bits(); - let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone())) + let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) .len(self.len) - .child_data(child_data); - if null_count > 0 { - builder = builder.null_bit_buffer(Some(null_bit_buffer)); - } + .child_data(child_data) + .null_bit_buffer(null_bit_buffer); self.len = 0; From a54d2f6b9e0f90627a264ec4c6ecd12d900eb4e3 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 13:12:20 +0800 Subject: [PATCH 10/21] update union builder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/union_builder.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow/src/array/builder/union_builder.rs b/arrow/src/array/builder/union_builder.rs index 95d9ea40a3d..5650b87818d 100644 --- a/arrow/src/array/builder/union_builder.rs +++ b/arrow/src/array/builder/union_builder.rs @@ -29,7 +29,7 @@ use crate::datatypes::Field; use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType}; use crate::error::{ArrowError, Result}; -use super::{BooleanBufferBuilder, BufferBuilder}; +use super::{NullBufferBuilder, BufferBuilder}; use crate::array::make_array; @@ -45,7 +45,7 @@ struct FieldData { /// The number of array slots represented by the buffer slots: usize, /// A builder for the null bitmap - bitmap_builder: BooleanBufferBuilder, + null_buffer_builder: NullBufferBuilder, } /// A type-erased [`BufferBuilder`] used by [`FieldData`] @@ -79,7 +79,7 @@ impl FieldData { data_type, slots: 0, values_buffer: Box::new(BufferBuilder::::new(1)), - bitmap_builder: BooleanBufferBuilder::new(1), + null_buffer_builder: NullBufferBuilder::new(1), } } @@ -91,14 +91,14 @@ impl FieldData { .expect("Tried to append unexpected type") .append(v); - self.bitmap_builder.append(true); + self.null_buffer_builder.append(true); self.slots += 1; } /// Appends a null to this `FieldData`. fn append_null(&mut self) { self.values_buffer.append_null(); - self.bitmap_builder.append(false); + self.null_buffer_builder.append(false); self.slots += 1; } } @@ -264,7 +264,7 @@ impl UnionBuilder { data_type, mut values_buffer, slots, - mut bitmap_builder, + null_buffer_builder: mut bitmap_builder, }, ) in self.fields.into_iter() { @@ -272,7 +272,7 @@ impl UnionBuilder { let arr_data_builder = ArrayDataBuilder::new(data_type.clone()) .add_buffer(buffer) .len(slots) - .null_bit_buffer(Some(bitmap_builder.finish())); + .null_bit_buffer(bitmap_builder.finish()); let arr_data_ref = unsafe { arr_data_builder.build_unchecked() }; let array_ref = make_array(arr_data_ref); From bd2e642f838edc88343cb6e1299738f3626cb417 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 13:14:56 +0800 Subject: [PATCH 11/21] rename Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/generic_list_builder.rs | 12 ++++++------ arrow/src/array/builder/map_builder.rs | 8 ++++---- arrow/src/array/builder/struct_builder.rs | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs index 1e5151ef719..9b29793779f 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -31,7 +31,7 @@ use super::{ArrayBuilder, NullBufferBuilder, BufferBuilder}; #[derive(Debug)] pub struct GenericListBuilder { offsets_builder: BufferBuilder, - bitmap_builder: NullBufferBuilder, + null_buffer_builder: NullBufferBuilder, values_builder: T, } @@ -49,7 +49,7 @@ impl GenericListBuilder usize { - self.bitmap_builder.len() + self.null_buffer_builder.len() } /// Returns whether the number of array slots is zero fn is_empty(&self) -> bool { - self.bitmap_builder.is_empty() + self.null_buffer_builder.is_empty() } /// Builds the array and reset this builder. @@ -113,7 +113,7 @@ where pub fn append(&mut self, is_valid: bool) { self.offsets_builder .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); - self.bitmap_builder.append(is_valid); + self.null_buffer_builder.append(is_valid); } /// Builds the [`GenericListArray`] and reset this builder. @@ -128,7 +128,7 @@ where let values_data = values_arr.data(); let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); self.offsets_builder.append(OffsetSize::zero()); let field = Box::new(Field::new( "item", diff --git a/arrow/src/array/builder/map_builder.rs b/arrow/src/array/builder/map_builder.rs index 29d4a9d465c..4ea02d53654 100644 --- a/arrow/src/array/builder/map_builder.rs +++ b/arrow/src/array/builder/map_builder.rs @@ -32,7 +32,7 @@ use crate::error::Result; #[derive(Debug)] pub struct MapBuilder { offsets_builder: BufferBuilder, - bitmap_builder: NullBufferBuilder, + null_buffer_builder: NullBufferBuilder, field_names: MapFieldNames, key_builder: K, value_builder: V, @@ -78,7 +78,7 @@ impl MapBuilder { offsets_builder.append(len); Self { offsets_builder, - bitmap_builder: NullBufferBuilder::new(capacity), + null_buffer_builder: NullBufferBuilder::new(capacity), field_names: field_names.unwrap_or_default(), key_builder, value_builder, @@ -107,7 +107,7 @@ impl MapBuilder { ))); } self.offsets_builder.append(self.key_builder.len() as i32); - self.bitmap_builder.append(is_valid); + self.null_buffer_builder.append(is_valid); self.len += 1; Ok(()) } @@ -145,7 +145,7 @@ impl MapBuilder { StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); self.offsets_builder.append(self.len); let map_field = Box::new(Field::new( self.field_names.entry.as_str(), diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow/src/array/builder/struct_builder.rs index 0121526982d..373a8458283 100644 --- a/arrow/src/array/builder/struct_builder.rs +++ b/arrow/src/array/builder/struct_builder.rs @@ -33,7 +33,7 @@ use super::NullBufferBuilder; pub struct StructBuilder { fields: Vec, field_builders: Vec>, - bitmap_builder: NullBufferBuilder, + null_buffer_builder: NullBufferBuilder, len: usize, } @@ -41,7 +41,7 @@ impl fmt::Debug for StructBuilder { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("StructBuilder") .field("fields", &self.fields) - .field("bitmap_builder", &self.bitmap_builder) + .field("bitmap_builder", &self.null_buffer_builder) .field("len", &self.len) .finish() } @@ -175,7 +175,7 @@ impl StructBuilder { Self { fields, field_builders, - bitmap_builder: NullBufferBuilder::new(0), + null_buffer_builder: NullBufferBuilder::new(0), len: 0, } } @@ -204,7 +204,7 @@ impl StructBuilder { /// should be appended for each child sub-array in a consistent way. #[inline] pub fn append(&mut self, is_valid: bool) { - self.bitmap_builder.append(is_valid); + self.null_buffer_builder.append(is_valid); self.len += 1; } @@ -222,7 +222,7 @@ impl StructBuilder { child_data.push(arr.into_data()); } - let null_bit_buffer = self.bitmap_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) .len(self.len) .child_data(child_data) From b473be3bacb02d3bda849aa11dfa08ddda9bdb87 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 14:15:53 +0800 Subject: [PATCH 12/21] add tests Signed-off-by: remzi <13716567376yh@gmail.com> --- .../src/array/builder/generic_list_builder.rs | 2 +- arrow/src/array/builder/map_builder.rs | 2 +- .../src/array/builder/null_buffer_builder.rs | 69 +++++++++++++++++-- arrow/src/array/builder/union_builder.rs | 2 +- 4 files changed, 67 insertions(+), 8 deletions(-) diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs index 9b29793779f..911182f6571 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -25,7 +25,7 @@ use crate::array::OffsetSizeTrait; use crate::datatypes::DataType; use crate::datatypes::Field; -use super::{ArrayBuilder, NullBufferBuilder, BufferBuilder}; +use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; /// Array builder for [`GenericListArray`] #[derive(Debug)] diff --git a/arrow/src/array/builder/map_builder.rs b/arrow/src/array/builder/map_builder.rs index 4ea02d53654..7e68abd5f1a 100644 --- a/arrow/src/array/builder/map_builder.rs +++ b/arrow/src/array/builder/map_builder.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use super::{ArrayBuilder, NullBufferBuilder, BufferBuilder}; +use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; use crate::array::array::Array; use crate::array::ArrayData; use crate::array::ArrayRef; diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 7765c66f651..4a316c94ecc 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -30,17 +30,17 @@ pub struct NullBufferBuilder { impl NullBufferBuilder { pub fn new(capacity: usize) -> Self { - Self { - bitmap_builder: None, + Self { + bitmap_builder: None, len: 0, - capacity + capacity, } } pub fn append_n_true(&mut self, n: usize) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_n(n, true) - } + } self.len += n; } @@ -102,4 +102,63 @@ impl NullBufferBuilder { pub fn is_empty(&self) -> bool { self.len == 0 } -} \ No newline at end of file +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_null_buffer_builder() { + let mut builder = NullBufferBuilder::new(0); + builder.append_false(); + builder.append_true(); + builder.append_n_false(2); + builder.append_n_true(2); + assert_eq!(6, builder.len()); + + let buf = builder.finish().unwrap(); + assert_eq!(Buffer::from(&[0b110010_u8]), buf); + } + + #[test] + fn test_null_buffer_builder_all_nulls() { + let mut builder = NullBufferBuilder::new(0); + builder.append_false(); + builder.append_n_false(2); + builder.append_slice(&[false, false, false]); + assert_eq!(6, builder.len()); + + let buf = builder.finish().unwrap(); + assert_eq!(Buffer::from(&[0b0_u8]), buf); + } + + #[test] + fn test_null_buffer_builder_no_null() { + let mut builder = NullBufferBuilder::new(0); + builder.append_true(); + builder.append_n_true(2); + builder.append_slice(&[true, true, true]); + assert_eq!(6, builder.len()); + + let buf = builder.finish(); + assert!(buf.is_none()); + } + + #[test] + fn test_null_buffer_builder_reset() { + let mut builder = NullBufferBuilder::new(0); + builder.append_slice(&[true, false, true]); + builder.finish(); + assert!(builder.is_empty()); + + builder.append_slice(&[true, true, true]); + assert!(builder.finish().is_none()); + assert!(builder.is_empty()); + + builder.append_slice(&[true, true, false, true]); + + let buf = builder.finish().unwrap(); + assert_eq!(Buffer::from(&[0b1011_u8]), buf); + } +} diff --git a/arrow/src/array/builder/union_builder.rs b/arrow/src/array/builder/union_builder.rs index 5650b87818d..8ef5028afc8 100644 --- a/arrow/src/array/builder/union_builder.rs +++ b/arrow/src/array/builder/union_builder.rs @@ -29,7 +29,7 @@ use crate::datatypes::Field; use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType}; use crate::error::{ArrowError, Result}; -use super::{NullBufferBuilder, BufferBuilder}; +use super::{BufferBuilder, NullBufferBuilder}; use crate::array::make_array; From 16740937c5e81a374d9915e71347f460e9a348be Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 15:02:09 +0800 Subject: [PATCH 13/21] optimize Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/null_buffer_builder.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 4a316c94ecc..4ec07e3c194 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -50,7 +50,7 @@ impl NullBufferBuilder { } pub fn append_n_false(&mut self, n: usize) { - self.materialize(); + self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); self.len += n; } @@ -70,7 +70,7 @@ impl NullBufferBuilder { pub fn append_slice(&mut self, slice: &[bool]) { if slice.iter().any(|v| !v) { - self.materialize() + self.materialize_if_needed() } if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_slice(slice) @@ -85,6 +85,14 @@ impl NullBufferBuilder { buf } + #[inline] + fn materialize_if_needed(&mut self) { + if self.bitmap_builder.is_none() { + self.materialize() + } + } + + #[cold] fn materialize(&mut self) { if self.bitmap_builder.is_none() { let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); From 04c43cc30b7a4a9bde2067c5318d2a0a1b5febdf Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 16:31:00 +0800 Subject: [PATCH 14/21] expose null buffer builder in builder mod Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/mod.rs | 2 +- arrow/src/array/builder/null_buffer_builder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs index d2d4ff85922..c1aac00f0cf 100644 --- a/arrow/src/array/builder/mod.rs +++ b/arrow/src/array/builder/mod.rs @@ -54,7 +54,7 @@ pub use generic_binary_builder::GenericBinaryBuilder; pub use generic_list_builder::GenericListBuilder; pub use generic_string_builder::GenericStringBuilder; pub use map_builder::MapBuilder; -pub use null_buffer_builder::NullBufferBuilder; +pub(self) use null_buffer_builder::NullBufferBuilder; pub use primitive_builder::PrimitiveBuilder; pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder; pub use string_dictionary_builder::StringDictionaryBuilder; diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 4ec07e3c194..ab0c3592113 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -22,7 +22,7 @@ use super::BooleanBufferBuilder; /// We only materialize the builder when we add `false`. /// This optimization is **very** important for the performance. #[derive(Debug)] -pub struct NullBufferBuilder { +pub(super) struct NullBufferBuilder { bitmap_builder: Option, len: usize, capacity: usize, From dd49d3082cca4e63d5eb8bb9144f171e0902fbac Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 22 Jul 2022 16:51:27 +0800 Subject: [PATCH 15/21] add docs Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 1 + arrow/src/array/builder/null_buffer_builder.rs | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index 64ffb2b442b..dcf80b693c5 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -93,6 +93,7 @@ impl BooleanBuilder { self.append_nulls(1) } + /// Appends `n` `null`s into the builder. #[inline] pub fn append_nulls(&mut self, n: usize) { self.null_buffer_builder.append_n_false(n); diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index ab0c3592113..07499583a7b 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -19,7 +19,10 @@ use crate::buffer::Buffer; use super::BooleanBufferBuilder; -/// We only materialize the builder when we add `false`. +/// Builder for creating the null bit buffer. +/// This builder only materializes the buffer when we append `false`. +/// If you only append `true`s to the builder, what you get will be +/// `None` when calling [`finish`](#method.finish). /// This optimization is **very** important for the performance. #[derive(Debug)] pub(super) struct NullBufferBuilder { @@ -29,6 +32,8 @@ pub(super) struct NullBufferBuilder { } impl NullBufferBuilder { + /// Creates a new empty builder. + /// `capacity` is the number of bits in the null buffer. pub fn new(capacity: usize) -> Self { Self { bitmap_builder: None, @@ -37,6 +42,7 @@ impl NullBufferBuilder { } } + /// Appends `n` `true`s into the builder. pub fn append_n_true(&mut self, n: usize) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_n(n, true) @@ -44,22 +50,26 @@ impl NullBufferBuilder { self.len += n; } + /// Appends a `true` into the builder. #[inline] pub fn append_true(&mut self) { self.append_n_true(1); } + /// Appends `n` `false`s into the builder. pub fn append_n_false(&mut self, n: usize) { self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); self.len += n; } + /// Appends a `false` into the builder. #[inline] pub fn append_false(&mut self) { self.append_n_false(1); } + /// Appends a boolean value into the builder. pub fn append(&mut self, v: bool) { if v { self.append_true() @@ -68,6 +78,7 @@ impl NullBufferBuilder { } } + /// Appends a boolean slice into the builder. pub fn append_slice(&mut self, slice: &[bool]) { if slice.iter().any(|v| !v) { self.materialize_if_needed() @@ -78,6 +89,8 @@ impl NullBufferBuilder { self.len += slice.len(); } + /// Builds the null buffer and resets the builder. + /// Returns `None` if the builder only contains `true`s. pub fn finish(&mut self) -> Option { let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); self.bitmap_builder = None; From 9b6d98411f5e90575e2318f9b7170fe7cdf74769 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sat, 23 Jul 2022 16:53:01 +0800 Subject: [PATCH 16/21] reduce redundant computation Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 5 ++-- .../src/array/builder/null_buffer_builder.rs | 26 ++++++++++++++----- arrow/src/array/builder/primitive_builder.rs | 5 ++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index dcf80b693c5..9b10474a497 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -84,13 +84,14 @@ impl BooleanBuilder { #[inline] pub fn append_value(&mut self, v: bool) { self.values_builder.append(v); - self.null_buffer_builder.append_n_true(1); + self.null_buffer_builder.append_true(); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.append_nulls(1) + self.null_buffer_builder.append_false(); + self.values_builder.advance(1); } /// Appends `n` `null`s into the builder. diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 07499583a7b..ba573de2e9c 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -27,6 +27,7 @@ use super::BooleanBufferBuilder; #[derive(Debug)] pub(super) struct NullBufferBuilder { bitmap_builder: Option, + /// Store the length of the buffer before materilaizing. len: usize, capacity: usize, } @@ -46,30 +47,36 @@ impl NullBufferBuilder { pub fn append_n_true(&mut self, n: usize) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_n(n, true) + } else { + self.len += n; } - self.len += n; } /// Appends a `true` into the builder. #[inline] pub fn append_true(&mut self) { - self.append_n_true(1); + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append(true) + } else { + self.len += 1; + } } /// Appends `n` `false`s into the builder. pub fn append_n_false(&mut self, n: usize) { self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); - self.len += n; } /// Appends a `false` into the builder. #[inline] pub fn append_false(&mut self) { - self.append_n_false(1); + self.materialize_if_needed(); + self.bitmap_builder.as_mut().unwrap().append(false); } /// Appends a boolean value into the builder. + #[inline] pub fn append(&mut self, v: bool) { if v { self.append_true() @@ -85,8 +92,9 @@ impl NullBufferBuilder { } if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_slice(slice) + } else { + self.len += slice.len(); } - self.len += slice.len(); } /// Builds the null buffer and resets the builder. @@ -117,11 +125,15 @@ impl NullBufferBuilder { impl NullBufferBuilder { pub fn len(&self) -> usize { - self.len + if let Some(b) = &self.bitmap_builder { + b.len() + } else { + self.len + } } pub fn is_empty(&self) -> bool { - self.len == 0 + self.len() == 0 } } diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index 5d5d919722d..6d24e5d483d 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -81,14 +81,15 @@ impl PrimitiveBuilder { /// Appends a value of type `T` into the builder #[inline] pub fn append_value(&mut self, v: T::Native) { - self.null_buffer_builder.append_n_true(1); + self.null_buffer_builder.append_true(); self.values_builder.append(v); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.append_nulls(1) + self.null_buffer_builder.append_false(); + self.values_builder.advance(1); } #[inline] From 0c0031fcc7e3780d0e01a09ce3865f0073855396 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sat, 23 Jul 2022 17:46:42 +0800 Subject: [PATCH 17/21] inline methods to achieve better performance Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/null_buffer_builder.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index ba573de2e9c..41bfc755812 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -44,6 +44,7 @@ impl NullBufferBuilder { } /// Appends `n` `true`s into the builder. + #[inline] pub fn append_n_true(&mut self, n: usize) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_n(n, true) @@ -63,6 +64,7 @@ impl NullBufferBuilder { } /// Appends `n` `false`s into the builder. + #[inline] pub fn append_n_false(&mut self, n: usize) { self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); From b09081fa1c69500e9295568906105d17c652e40d Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sat, 23 Jul 2022 17:48:33 +0800 Subject: [PATCH 18/21] use instead of pub(self) use Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs index c1aac00f0cf..7fe348f1b8a 100644 --- a/arrow/src/array/builder/mod.rs +++ b/arrow/src/array/builder/mod.rs @@ -54,7 +54,7 @@ pub use generic_binary_builder::GenericBinaryBuilder; pub use generic_list_builder::GenericListBuilder; pub use generic_string_builder::GenericStringBuilder; pub use map_builder::MapBuilder; -pub(self) use null_buffer_builder::NullBufferBuilder; +use null_buffer_builder::NullBufferBuilder; pub use primitive_builder::PrimitiveBuilder; pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder; pub use string_dictionary_builder::StringDictionaryBuilder; From 1444d7b63ba6834ed63ca52bb92077ae4f1ded4f Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Mon, 25 Jul 2022 17:43:16 +0800 Subject: [PATCH 19/21] Update arrow/src/array/builder/null_buffer_builder.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix spelling mistake Co-authored-by: Jörn Horstmann --- arrow/src/array/builder/null_buffer_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 41bfc755812..40ee34d3649 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -27,7 +27,7 @@ use super::BooleanBufferBuilder; #[derive(Debug)] pub(super) struct NullBufferBuilder { bitmap_builder: Option, - /// Store the length of the buffer before materilaizing. + /// Store the length of the buffer before materializing. len: usize, capacity: usize, } From c073b32d172aeaa8a6995efbdcfcea6bba27f3fe Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 26 Jul 2022 07:16:24 +0800 Subject: [PATCH 20/21] reorder Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index 9b10474a497..2492f0017ab 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -113,8 +113,8 @@ impl BooleanBuilder { /// Appends a slice of type `T` into the builder #[inline] pub fn append_slice(&mut self, v: &[bool]) { - self.null_buffer_builder.append_n_true(v.len()); self.values_builder.append_slice(v); + self.null_buffer_builder.append_n_true(v.len()); } /// Appends values from a slice of type `T` and a validity boolean slice. From 49c963c3361946876570dfca7e3c16e2e458dbe0 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 26 Jul 2022 10:00:37 +0800 Subject: [PATCH 21/21] rename methods and update docs Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/builder/boolean_builder.rs | 8 ++-- .../builder/fixed_size_binary_builder.rs | 4 +- .../src/array/builder/null_buffer_builder.rs | 47 ++++++++++--------- arrow/src/array/builder/primitive_builder.rs | 10 ++-- 4 files changed, 37 insertions(+), 32 deletions(-) diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs index 2492f0017ab..e28e37bc9e2 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow/src/array/builder/boolean_builder.rs @@ -84,20 +84,20 @@ impl BooleanBuilder { #[inline] pub fn append_value(&mut self, v: bool) { self.values_builder.append(v); - self.null_buffer_builder.append_true(); + self.null_buffer_builder.append_non_null(); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.null_buffer_builder.append_false(); + self.null_buffer_builder.append_null(); self.values_builder.advance(1); } /// Appends `n` `null`s into the builder. #[inline] pub fn append_nulls(&mut self, n: usize) { - self.null_buffer_builder.append_n_false(n); + self.null_buffer_builder.append_n_nulls(n); self.values_builder.advance(n); } @@ -114,7 +114,7 @@ impl BooleanBuilder { #[inline] pub fn append_slice(&mut self, v: &[bool]) { self.values_builder.append_slice(v); - self.null_buffer_builder.append_n_true(v.len()); + self.null_buffer_builder.append_n_non_nulls(v.len()); } /// Appends values from a slice of type `T` and a validity boolean slice. diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow/src/array/builder/fixed_size_binary_builder.rs index a03e6dcc35e..96f90c4cae2 100644 --- a/arrow/src/array/builder/fixed_size_binary_builder.rs +++ b/arrow/src/array/builder/fixed_size_binary_builder.rs @@ -64,7 +64,7 @@ impl FixedSizeBinaryBuilder { )) } else { self.values_builder.append_slice(value.as_ref()); - self.null_buffer_builder.append_true(); + self.null_buffer_builder.append_non_null(); Ok(()) } } @@ -74,7 +74,7 @@ impl FixedSizeBinaryBuilder { pub fn append_null(&mut self) { self.values_builder .append_slice(&vec![0u8; self.value_length as usize][..]); - self.null_buffer_builder.append_false(); + self.null_buffer_builder.append_null(); } /// Builds the [`FixedSizeBinaryArray`] and reset this builder. diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow/src/array/builder/null_buffer_builder.rs index 40ee34d3649..ef2e4c50ab9 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow/src/array/builder/null_buffer_builder.rs @@ -43,9 +43,10 @@ impl NullBufferBuilder { } } - /// Appends `n` `true`s into the builder. + /// Appends `n` `true`s into the builder + /// to indicate that these `n` items are not nulls. #[inline] - pub fn append_n_true(&mut self, n: usize) { + pub fn append_n_non_nulls(&mut self, n: usize) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append_n(n, true) } else { @@ -53,9 +54,10 @@ impl NullBufferBuilder { } } - /// Appends a `true` into the builder. + /// Appends a `true` into the builder + /// to indicate that this item is not null. #[inline] - pub fn append_true(&mut self) { + pub fn append_non_null(&mut self) { if let Some(buf) = self.bitmap_builder.as_mut() { buf.append(true) } else { @@ -63,31 +65,34 @@ impl NullBufferBuilder { } } - /// Appends `n` `false`s into the builder. + /// Appends `n` `false`s into the builder + /// to indicate that these `n` items are nulls. #[inline] - pub fn append_n_false(&mut self, n: usize) { + pub fn append_n_nulls(&mut self, n: usize) { self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append_n(n, false); } - /// Appends a `false` into the builder. + /// Appends a `false` into the builder + /// to indicate that this item is null. #[inline] - pub fn append_false(&mut self) { + pub fn append_null(&mut self) { self.materialize_if_needed(); self.bitmap_builder.as_mut().unwrap().append(false); } /// Appends a boolean value into the builder. #[inline] - pub fn append(&mut self, v: bool) { - if v { - self.append_true() + pub fn append(&mut self, not_null: bool) { + if not_null { + self.append_non_null() } else { - self.append_false() + self.append_null() } } - /// Appends a boolean slice into the builder. + /// Appends a boolean slice into the builder + /// to indicate the validations of these items. pub fn append_slice(&mut self, slice: &[bool]) { if slice.iter().any(|v| !v) { self.materialize_if_needed() @@ -146,10 +151,10 @@ mod tests { #[test] fn test_null_buffer_builder() { let mut builder = NullBufferBuilder::new(0); - builder.append_false(); - builder.append_true(); - builder.append_n_false(2); - builder.append_n_true(2); + builder.append_null(); + builder.append_non_null(); + builder.append_n_nulls(2); + builder.append_n_non_nulls(2); assert_eq!(6, builder.len()); let buf = builder.finish().unwrap(); @@ -159,8 +164,8 @@ mod tests { #[test] fn test_null_buffer_builder_all_nulls() { let mut builder = NullBufferBuilder::new(0); - builder.append_false(); - builder.append_n_false(2); + builder.append_null(); + builder.append_n_nulls(2); builder.append_slice(&[false, false, false]); assert_eq!(6, builder.len()); @@ -171,8 +176,8 @@ mod tests { #[test] fn test_null_buffer_builder_no_null() { let mut builder = NullBufferBuilder::new(0); - builder.append_true(); - builder.append_n_true(2); + builder.append_non_null(); + builder.append_n_non_nulls(2); builder.append_slice(&[true, true, true]); assert_eq!(6, builder.len()); diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index 6d24e5d483d..3b9db1f01e6 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -81,20 +81,20 @@ impl PrimitiveBuilder { /// Appends a value of type `T` into the builder #[inline] pub fn append_value(&mut self, v: T::Native) { - self.null_buffer_builder.append_true(); + self.null_buffer_builder.append_non_null(); self.values_builder.append(v); } /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { - self.null_buffer_builder.append_false(); + self.null_buffer_builder.append_null(); self.values_builder.advance(1); } #[inline] pub fn append_nulls(&mut self, n: usize) { - self.null_buffer_builder.append_n_false(n); + self.null_buffer_builder.append_n_nulls(n); self.values_builder.advance(n); } @@ -110,7 +110,7 @@ impl PrimitiveBuilder { /// Appends a slice of type `T` into the builder #[inline] pub fn append_slice(&mut self, v: &[T::Native]) { - self.null_buffer_builder.append_n_true(v.len()); + self.null_buffer_builder.append_n_non_nulls(v.len()); self.values_builder.append_slice(v); } @@ -142,7 +142,7 @@ impl PrimitiveBuilder { .1 .expect("append_trusted_len_iter requires an upper bound"); - self.null_buffer_builder.append_n_true(len); + self.null_buffer_builder.append_n_non_nulls(len); self.values_builder.append_trusted_len_iter(iter); }