diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 427bda636ee..d36caaca810 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -166,6 +166,17 @@ impl PrimitiveArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + + /// Returns the backing [`ArrayData`] of this [`PrimitiveArray`] + pub fn into_data(self) -> ArrayData { + self.into() + } +} + +impl From> for ArrayData { + fn from(a: PrimitiveArray) -> Self { + a.data + } } impl Array for PrimitiveArray { diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index dba983b0180..d007582935c 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -195,6 +195,11 @@ impl GenericStringArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + + /// Returns the backing [`ArrayData`] of this [`GenericStringArray`] + pub fn into_data(self) -> ArrayData { + self.into() + } } impl<'a, Ptr, OffsetSize: OffsetSizeTrait> FromIterator<&'a Option> @@ -336,6 +341,12 @@ impl From> for GenericStringArray From> for ArrayData { + fn from(a: GenericStringArray) -> Self { + a.data + } +} + /// An array where each element is a variable-sized sequence of bytes representing a string /// whose maximum length (in bytes) is represented by a i32. /// diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs index b18239b7547..2f5eeac73c0 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow/src/array/builder/primitive_builder.rs @@ -20,10 +20,8 @@ use std::sync::Arc; use crate::array::ArrayData; use crate::array::ArrayRef; -use crate::array::DictionaryArray; use crate::array::PrimitiveArray; use crate::datatypes::ArrowPrimitiveType; -use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder}; @@ -197,30 +195,6 @@ impl PrimitiveBuilder { PrimitiveArray::::from(array_data) } - /// Builds the `DictionaryArray` and reset this builder. - pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); - let data_type = DataType::Dictionary( - Box::new(T::DATA_TYPE), - Box::new(values.data_type().clone()), - ); - let mut builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(self.values_builder.finish()); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer); - } - builder = builder.add_child_data(values.data().clone()); - let array_data = unsafe { builder.build_unchecked() }; - DictionaryArray::::from(array_data) - } - fn materialize_bitmap_builder(&mut self) { if self.bitmap_builder.is_some() { return; diff --git a/arrow/src/array/builder/primitive_dictionary_builder.rs b/arrow/src/array/builder/primitive_dictionary_builder.rs index 93695e0b730..5e989f3cc87 100644 --- a/arrow/src/array/builder/primitive_dictionary_builder.rs +++ b/arrow/src/array/builder/primitive_dictionary_builder.rs @@ -19,11 +19,8 @@ use std::any::Any; use std::collections::HashMap; use std::sync::Arc; -use crate::array::ArrayRef; -use crate::array::ArrowPrimitiveType; -use crate::array::DictionaryArray; -use crate::datatypes::ArrowNativeType; -use crate::datatypes::ToByteSlice; +use crate::array::{ArrayRef, ArrowPrimitiveType, DictionaryArray}; +use crate::datatypes::{ArrowNativeType, DataType, ToByteSlice}; use crate::error::{ArrowError, Result}; use super::ArrayBuilder; @@ -164,8 +161,19 @@ where /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) + let values = self.values_builder.finish(); + let keys = self.keys_builder.finish(); + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) } } diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs index 918bf975692..77b2b23160c 100644 --- a/arrow/src/array/builder/string_dictionary_builder.rs +++ b/arrow/src/array/builder/string_dictionary_builder.rs @@ -19,7 +19,7 @@ use super::PrimitiveBuilder; use crate::array::{ Array, ArrayBuilder, ArrayRef, DictionaryArray, StringArray, StringBuilder, }; -use crate::datatypes::{ArrowDictionaryKeyType, ArrowNativeType}; +use crate::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; use crate::error::{ArrowError, Result}; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; @@ -250,8 +250,19 @@ where /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) + let values = self.values_builder.finish(); + let keys = self.keys_builder.finish(); + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) } } diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 52920827d24..c1d608228af 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1217,6 +1217,11 @@ impl ArrayData { .zip(other.child_data.iter()) .all(|(a, b)| a.ptr_eq(b)) } + + /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] + pub fn into_builder(self) -> ArrayDataBuilder { + self.into() + } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -1408,6 +1413,10 @@ impl ArrayDataBuilder { } } + pub fn data_type(self, data_type: DataType) -> Self { + Self { data_type, ..self } + } + #[inline] #[allow(clippy::len_without_is_empty)] pub const fn len(mut self, n: usize) -> Self { @@ -1482,6 +1491,22 @@ impl ArrayDataBuilder { } } +impl From for ArrayDataBuilder { + fn from(d: ArrayData) -> Self { + // TODO: Store Bitmap on ArrayData (#1799) + let null_bit_buffer = d.null_buffer().cloned(); + Self { + null_bit_buffer, + data_type: d.data_type, + len: d.len, + null_count: Some(d.null_count), + offset: d.offset, + buffers: d.buffers, + child_data: d.child_data, + } + } +} + #[cfg(test)] mod tests { use super::*;