diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index df858d858e2..12a6b2f98b5 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -20,8 +20,8 @@ use std::fmt; use std::{any::Any, iter::FromIterator}; use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray, - GenericStringIter, OffsetSizeTrait, + array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, + GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait, }; use crate::array::array::ArrayAccessor; use crate::buffer::Buffer; @@ -313,6 +313,27 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor } } +impl From> + for GenericStringArray +{ + fn from(v: GenericListArray) -> Self { + GenericStringArray::::from_list(v) + } +} + +impl From> + for GenericStringArray +{ + fn from(v: GenericBinaryArray) -> Self { + let builder = v + .into_data() + .into_builder() + .data_type(Self::get_data_type()); + let data = unsafe { builder.build_unchecked() }; + Self::from(data) + } +} + impl From for GenericStringArray { fn from(data: ArrayData) -> Self { assert_eq!( @@ -385,12 +406,6 @@ pub type StringArray = GenericStringArray; /// ``` pub type LargeStringArray = GenericStringArray; -impl From> for GenericStringArray { - fn from(v: GenericListArray) -> Self { - GenericStringArray::::from_list(v) - } -} - #[cfg(test)] mod tests { diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs index 1e29c470cac..8f242243cd7 100644 --- a/arrow/src/array/builder/generic_binary_builder.rs +++ b/arrow/src/array/builder/generic_binary_builder.rs @@ -48,6 +48,19 @@ impl GenericBinaryBuilder { } } + /// Creates a new [`GenericBinaryBuilder`], + /// `item_capacity` is the number of items to pre-allocate space for in this builder + /// `data_capacity` is the number of bytes to pre-allocate space for in this builder + pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { + let mut offsets_builder = BufferBuilder::::new(item_capacity + 1); + offsets_builder.append(OffsetSize::zero()); + Self { + value_builder: UInt8BufferBuilder::new(data_capacity), + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(item_capacity), + } + } + /// Appends a byte slice into the builder. #[inline] pub fn append_value(&mut self, value: impl AsRef<[u8]>) { @@ -82,6 +95,16 @@ impl GenericBinaryBuilder { let array_data = unsafe { array_builder.build_unchecked() }; GenericBinaryArray::::from(array_data) } + + /// Returns the current values buffer as a slice + pub fn values_slice(&self) -> &[u8] { + self.value_builder.as_slice() + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } } impl ArrayBuilder for GenericBinaryBuilder { diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs index d44aed44a49..02c34bdd313 100644 --- a/arrow/src/array/builder/generic_string_builder.rs +++ b/arrow/src/array/builder/generic_string_builder.rs @@ -15,60 +15,46 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ - ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait, - UInt8Builder, -}; +use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait}; use std::any::Any; use std::sync::Arc; +use super::GenericBinaryBuilder; + +/// Array builder for [`GenericStringArray`] #[derive(Debug)] pub struct GenericStringBuilder { - builder: GenericListBuilder, + builder: GenericBinaryBuilder, } impl GenericStringBuilder { - /// Creates a new `StringBuilder`, + /// Creates a new [`GenericStringBuilder`], /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); Self { - builder: GenericListBuilder::new(values_builder), + builder: GenericBinaryBuilder::new(capacity), } } - /// Creates a new `StringBuilder`, + /// Creates a new [`GenericStringBuilder`], /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder /// `item_capacity` is the number of items to pre-allocate space for in this builder pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let values_builder = UInt8Builder::new(data_capacity); Self { - builder: GenericListBuilder::with_capacity(values_builder, item_capacity), + builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity), } } /// Appends a string into the builder. - /// - /// Automatically calls the `append` method to delimit the string appended in as a - /// distinct array element. #[inline] pub fn append_value(&mut self, value: impl AsRef) { - self.builder - .values() - .append_slice(value.as_ref().as_bytes()); - self.builder.append(true); - } - - /// Finish the current variable-length list array slot. - #[inline] - pub fn append(&mut self, is_valid: bool) { - self.builder.append(is_valid) + self.builder.append_value(value.as_ref().as_bytes()); } /// Append a null value to the array. #[inline] pub fn append_null(&mut self) { - self.append(false) + self.builder.append_null() } /// Append an `Option` value to the array. @@ -80,14 +66,14 @@ impl GenericStringBuilder { }; } - /// Builds the `StringArray` and reset this builder. + /// Builds the [`GenericStringArray`] and reset this builder. pub fn finish(&mut self) -> GenericStringArray { GenericStringArray::::from(self.builder.finish()) } /// Returns the current values buffer as a slice pub fn values_slice(&self) -> &[u8] { - self.builder.values_ref().values_slice() + self.builder.values_slice() } /// Returns the current offsets buffer as a slice @@ -131,79 +117,72 @@ impl ArrayBuilder for GenericStringBuilder() { + let mut builder = GenericStringBuilder::::new(20); + let owned = "arrow".to_owned(); builder.append_value("hello"); - builder.append(true); - builder.append_value("world"); - - let string_array = builder.finish(); + builder.append_value(""); + builder.append_value(&owned); + builder.append_null(); + builder.append_option(Some("rust")); + builder.append_option(None::<&str>); + builder.append_option(None::); + assert_eq!(7, builder.len()); + + assert_eq!( + GenericStringArray::::from(vec![ + Some("hello"), + Some(""), + Some("arrow"), + None, + Some("rust"), + None, + None + ]), + builder.finish() + ); + } - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); + #[test] + fn test_string_array_builder() { + _test_generic_string_array_builder::() } #[test] - fn test_string_array_builder_finish() { - let mut builder = StringBuilder::new(10); + fn test_large_string_array_builder() { + _test_generic_string_array_builder::() + } + + fn _test_generic_string_array_builder_finish() { + let mut builder = GenericStringBuilder::::with_capacity(3, 11); builder.append_value("hello"); - builder.append_value("world"); + builder.append_value("rust"); + builder.append_null(); - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); + builder.finish(); + assert!(builder.is_empty()); + assert_eq!(&[O::zero()], builder.offsets_slice()); builder.append_value("arrow"); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); + builder.append_value("parquet"); + let arr = builder.finish(); + // array should not have null buffer because there is not `null` value. + assert_eq!(None, arr.data().null_buffer()); + assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) } #[test] - fn test_string_array_builder_append_string() { - let mut builder = StringBuilder::new(20); - - let var = "hello".to_owned(); - builder.append_value(&var); - builder.append(true); - builder.append_value("world"); - - let string_array = builder.finish(); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); + fn test_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() } #[test] - fn test_string_array_builder_append_option() { - let mut builder = StringBuilder::new(20); - builder.append_option(Some("hello")); - builder.append_option(None::<&str>); - builder.append_option(None::); - builder.append_option(Some("world")); - - let string_array = builder.finish(); - - assert_eq!(4, string_array.len()); - assert_eq!("hello", string_array.value(0)); - assert!(string_array.is_null(1)); - assert!(string_array.is_null(2)); - assert_eq!("world", string_array.value(3)); + fn test_large_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() } }