Skip to content

Commit

Permalink
Simplify DictionaryBuilder constructors (apache#2684) (apache#2054)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Sep 8, 2022
1 parent 566ef3d commit 47c0d19
Show file tree
Hide file tree
Showing 14 changed files with 140 additions and 180 deletions.
8 changes: 4 additions & 4 deletions arrow/benches/string_dictionary_builder.rs
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder};
use arrow::array::StringDictionaryBuilder;
use arrow::datatypes::Int32Type;
use criterion::{criterion_group, criterion_main, Criterion};
use rand::{thread_rng, Rng};

Expand Down Expand Up @@ -43,12 +44,11 @@ fn criterion_benchmark(c: &mut Criterion) {
|b| {
let strings = build_strings(dict_size, total_size, key_len);
b.iter(|| {
let keys = Int32Builder::with_capacity(strings.len());
let values = StringBuilder::with_capacity(
let mut builder = StringDictionaryBuilder::<Int32Type>::with_capacity(
strings.len(),
key_len + 1,
(key_len + 1) * dict_size,
);
let mut builder = StringDictionaryBuilder::new(keys, values);

for val in &strings {
builder.append(val).unwrap();
Expand Down
22 changes: 8 additions & 14 deletions arrow/src/array/array_dictionary.rs
Expand Up @@ -22,8 +22,8 @@ use std::iter::IntoIterator;
use std::{convert::From, iter::FromIterator};

use super::{
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder,
StringArray, StringBuilder, StringDictionaryBuilder,
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray,
StringDictionaryBuilder,
};
use crate::datatypes::{
ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType,
Expand Down Expand Up @@ -329,9 +329,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<Option<&'a str>> for Dictionary
fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::with_capacity(lower);
let value_builder = StringBuilder::with_capacity(256, 1024);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
it.for_each(|i| {
if let Some(i) = i {
// Note: impl ... for Result<DictionaryArray<T>> fails with
Expand Down Expand Up @@ -367,9 +365,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray<T>
fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::with_capacity(lower);
let value_builder = StringBuilder::with_capacity(256, 1024);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
it.for_each(|i| {
builder
.append(i)
Expand Down Expand Up @@ -589,9 +585,8 @@ mod tests {

#[test]
fn test_dictionary_array_fmt_debug() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
Expand All @@ -601,9 +596,8 @@ mod tests {
format!("{:?}", array)
);

let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(20);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(20, 2);
for _ in 0..20 {
builder.append(1).unwrap();
}
Expand Down
45 changes: 30 additions & 15 deletions arrow/src/array/builder/primitive_dictionary_builder.rs
Expand Up @@ -60,9 +60,7 @@ impl<T: ToByteSlice> Eq for Value<T> {}
/// };
/// use arrow::datatypes::{UInt8Type, UInt32Type};
///
/// let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
/// let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
/// let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
/// builder.append(12345678).unwrap();
/// builder.append_null();
/// builder.append(22345678).unwrap();
Expand Down Expand Up @@ -95,22 +93,41 @@ where
map: HashMap<Value<V::Native>, K::Native>,
}

impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
fn default() -> Self {
Self::new()
}
}

impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
/// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder.
pub fn new(
keys_builder: PrimitiveBuilder<K>,
values_builder: PrimitiveBuilder<V>,
) -> Self {
pub fn new() -> Self {
Self {
keys_builder,
values_builder,
keys_builder: PrimitiveBuilder::new(),
values_builder: PrimitiveBuilder::new(),
map: HashMap::new(),
}
}

/// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities
///
/// `keys_capacity`: the number of keys, i.e. length of array to build
/// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary
pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
Self {
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder: PrimitiveBuilder::with_capacity(values_capacity),
map: HashMap::with_capacity(values_capacity),
}
}
}

impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
Expand Down Expand Up @@ -211,9 +228,8 @@ mod tests {

#[test]
fn test_primitive_dictionary_builder() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
Expand All @@ -239,9 +255,8 @@ mod tests {
#[test]
#[should_panic(expected = "DictionaryKeyOverflowError")]
fn test_primitive_dictionary_overflow() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(257);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(257);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
// 256 unique keys.
for i in 0..256 {
builder.append(i + 1000).unwrap();
Expand Down
54 changes: 38 additions & 16 deletions arrow/src/array/builder/string_dictionary_builder.rs
Expand Up @@ -42,9 +42,7 @@ use std::sync::Arc;
/// // Create a dictionary array indexed by bytes whose values are Strings.
/// // It can thus hold up to 256 distinct string values.
///
/// let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(100);
/// let value_builder = StringBuilder::new();
/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
Expand Down Expand Up @@ -84,12 +82,23 @@ where
values_builder: StringBuilder,
}

impl<K> Default for StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
fn default() -> Self {
Self::new()
}
}

impl<K> StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
/// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder.
pub fn new(keys_builder: PrimitiveBuilder<K>, values_builder: StringBuilder) -> Self {
/// Creates a new `StringDictionaryBuilder`
pub fn new() -> Self {
let keys_builder = PrimitiveBuilder::new();
let values_builder = StringBuilder::new();
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
Expand All @@ -98,6 +107,24 @@ where
}
}

/// Creates a new `StringDictionaryBuilder` with the provided capacities
///
/// `keys_capacity`: the number of keys, i.e. length of array to build
/// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
/// `string_capacity`: the total number of bytes of all distinct strings in the dictionary
pub fn with_capacity(
keys_capacity: usize,
value_capacity: usize,
string_capacity: usize,
) -> Self {
Self {
state: Default::default(),
dedup: Default::default(),
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder: StringBuilder::with_capacity(value_capacity, string_capacity),
}
}

/// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary
/// which is initialized with the given values.
/// The indices of those dictionary values are used as keys.
Expand All @@ -111,7 +138,7 @@ where
///
/// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
///
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::<Int16Type>::with_capacity(3), &dictionary_values).unwrap();
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
/// builder.append("def").unwrap();
/// builder.append_null();
/// builder.append("abc").unwrap();
Expand All @@ -123,7 +150,7 @@ where
/// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
/// ```
pub fn new_with_dictionary(
keys_builder: PrimitiveBuilder<K>,
keys_capacity: usize,
dictionary_values: &StringArray,
) -> Result<Self> {
let state = ahash::RandomState::default();
Expand Down Expand Up @@ -162,7 +189,7 @@ where
Ok(Self {
state,
dedup,
keys_builder,
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder,
})
}
Expand Down Expand Up @@ -290,9 +317,7 @@ mod tests {

#[test]
fn test_string_dictionary_builder() {
let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(5);
let value_builder = StringBuilder::new();
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::<Int8Type>::new();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
Expand All @@ -317,10 +342,8 @@ mod tests {
fn test_string_dictionary_builder_with_existing_dictionary() {
let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]);

let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(6);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
.unwrap();
StringDictionaryBuilder::new_with_dictionary(6, &dictionary).unwrap();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
Expand Down Expand Up @@ -349,9 +372,8 @@ mod tests {
let dictionary: Vec<Option<&str>> = vec![None];
let dictionary = StringArray::from(dictionary);

let key_builder = PrimitiveBuilder::<Int16Type>::with_capacity(4);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
StringDictionaryBuilder::<Int16Type>::new_with_dictionary(4, &dictionary)
.unwrap();
builder.append("abc").unwrap();
builder.append_null();
Expand Down
8 changes: 4 additions & 4 deletions arrow/src/array/equal/mod.rs
Expand Up @@ -265,8 +265,8 @@ mod tests {
use crate::array::{
array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray,
FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder,
ListBuilder, NullArray, PrimitiveBuilder, StringArray, StringDictionaryBuilder,
StructArray, UnionBuilder,
ListBuilder, NullArray, StringArray, StringDictionaryBuilder, StructArray,
UnionBuilder,
};
use crate::array::{GenericStringArray, Int32Array};
use crate::buffer::Buffer;
Expand Down Expand Up @@ -1245,8 +1245,8 @@ mod tests {

fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData {
let values = StringArray::from(values.to_vec());
let mut builder = StringDictionaryBuilder::new_with_dictionary(
PrimitiveBuilder::<Int16Type>::with_capacity(3),
let mut builder = StringDictionaryBuilder::<Int16Type>::new_with_dictionary(
keys.len(),
&values,
)
.unwrap();
Expand Down
8 changes: 4 additions & 4 deletions arrow/src/array/transform/mod.rs
Expand Up @@ -675,8 +675,8 @@ mod tests {
array::{
Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray,
FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array,
Int64Builder, ListBuilder, MapBuilder, NullArray, PrimitiveBuilder,
StringArray, StringDictionaryBuilder, StructArray, UInt8Array,
Int64Builder, ListBuilder, MapBuilder, NullArray, StringArray,
StringDictionaryBuilder, StructArray, UInt8Array,
},
buffer::Buffer,
datatypes::Field,
Expand Down Expand Up @@ -963,8 +963,8 @@ mod tests {

fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData {
let values = StringArray::from(values.to_vec());
let mut builder = StringDictionaryBuilder::new_with_dictionary(
PrimitiveBuilder::<Int16Type>::with_capacity(3),
let mut builder = StringDictionaryBuilder::<Int16Type>::new_with_dictionary(
keys.len(),
&values,
)
.unwrap();
Expand Down

0 comments on commit 47c0d19

Please sign in to comment.