Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify DictionaryBuilder constructors (#2684) (#2054) #2685

Merged
merged 3 commits into from Sep 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions arrow/benches/string_dictionary_builder.rs
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder};
use arrow::array::StringDictionaryBuilder;
use arrow::datatypes::Int32Type;
use criterion::{criterion_group, criterion_main, Criterion};
use rand::{thread_rng, Rng};

Expand Down Expand Up @@ -43,12 +44,11 @@ fn criterion_benchmark(c: &mut Criterion) {
|b| {
let strings = build_strings(dict_size, total_size, key_len);
b.iter(|| {
let keys = Int32Builder::with_capacity(strings.len());
let values = StringBuilder::with_capacity(
let mut builder = StringDictionaryBuilder::<Int32Type>::with_capacity(
strings.len(),
key_len + 1,
(key_len + 1) * dict_size,
);
let mut builder = StringDictionaryBuilder::new(keys, values);

for val in &strings {
builder.append(val).unwrap();
Expand Down
22 changes: 8 additions & 14 deletions arrow/src/array/array_dictionary.rs
Expand Up @@ -22,8 +22,8 @@ use std::iter::IntoIterator;
use std::{convert::From, iter::FromIterator};

use super::{
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder,
StringArray, StringBuilder, StringDictionaryBuilder,
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray,
StringDictionaryBuilder,
};
use crate::datatypes::{
ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType,
Expand Down Expand Up @@ -329,9 +329,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<Option<&'a str>> for Dictionary
fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::with_capacity(lower);
let value_builder = StringBuilder::with_capacity(256, 1024);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
it.for_each(|i| {
if let Some(i) = i {
// Note: impl ... for Result<DictionaryArray<T>> fails with
Expand Down Expand Up @@ -367,9 +365,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray<T>
fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::with_capacity(lower);
let value_builder = StringBuilder::with_capacity(256, 1024);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
it.for_each(|i| {
builder
.append(i)
Expand Down Expand Up @@ -589,9 +585,8 @@ mod tests {

#[test]
fn test_dictionary_array_fmt_debug() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
Expand All @@ -601,9 +596,8 @@ mod tests {
format!("{:?}", array)
);

let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(20);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(20, 2);
for _ in 0..20 {
builder.append(1).unwrap();
}
Expand Down
47 changes: 31 additions & 16 deletions arrow/src/array/builder/primitive_dictionary_builder.rs
Expand Up @@ -60,9 +60,7 @@ impl<T: ToByteSlice> Eq for Value<T> {}
/// };
/// use arrow::datatypes::{UInt8Type, UInt32Type};
///
/// let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
/// let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
/// let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
/// builder.append(12345678).unwrap();
/// builder.append_null();
/// builder.append(22345678).unwrap();
Expand Down Expand Up @@ -95,22 +93,41 @@ where
map: HashMap<Value<V::Native>, K::Native>,
}

impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
fn default() -> Self {
Self::new()
}
}

impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
K: ArrowPrimitiveType,
V: ArrowPrimitiveType,
{
/// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder.
pub fn new(
keys_builder: PrimitiveBuilder<K>,
values_builder: PrimitiveBuilder<V>,
) -> Self {
/// Creates a new `PrimitiveDictionaryBuilder`.
pub fn new() -> Self {
Self {
keys_builder,
values_builder,
keys_builder: PrimitiveBuilder::new(),
values_builder: PrimitiveBuilder::new(),
Comment on lines +112 to +115
Copy link
Member

@viirya viirya Sep 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although I think the updated API is more clear, I just wonder if the issue is provided key builder might be not empty, we can check the length of the key builder when it is passed in? The API could be unchanged.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For sure, however, I'd argue it is a good opportunity to bring this API closer into alignment with the other builders. We don't require passing the various different inner builders to StringArray, for example. The only other place we do is ListArray, which again I think is a little odd.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Yea, it is somehow weird to provide key and value builders from outside of these dictionary builder.

map: HashMap::new(),
}
}

/// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities
///
/// `keys_capacity`: the number of keys, i.e. length of array to build
/// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary
pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
Self {
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder: PrimitiveBuilder::with_capacity(values_capacity),
map: HashMap::with_capacity(values_capacity),
}
}
}

impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
Expand Down Expand Up @@ -211,9 +228,8 @@ mod tests {

#[test]
fn test_primitive_dictionary_builder() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(3);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(2);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
Expand All @@ -239,9 +255,8 @@ mod tests {
#[test]
#[should_panic(expected = "DictionaryKeyOverflowError")]
fn test_primitive_dictionary_overflow() {
let key_builder = PrimitiveBuilder::<UInt8Type>::with_capacity(257);
let value_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(257);
let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
let mut builder =
PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
// 256 unique keys.
for i in 0..256 {
builder.append(i + 1000).unwrap();
Expand Down
56 changes: 39 additions & 17 deletions arrow/src/array/builder/string_dictionary_builder.rs
Expand Up @@ -42,9 +42,7 @@ use std::sync::Arc;
/// // Create a dictionary array indexed by bytes whose values are Strings.
/// // It can thus hold up to 256 distinct string values.
///
/// let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(100);
/// let value_builder = StringBuilder::new();
/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
Expand Down Expand Up @@ -84,12 +82,23 @@ where
values_builder: StringBuilder,
}

impl<K> Default for StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
fn default() -> Self {
Self::new()
}
}

impl<K> StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
/// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder.
pub fn new(keys_builder: PrimitiveBuilder<K>, values_builder: StringBuilder) -> Self {
/// Creates a new `StringDictionaryBuilder`
pub fn new() -> Self {
let keys_builder = PrimitiveBuilder::new();
let values_builder = StringBuilder::new();
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
Expand All @@ -98,7 +107,25 @@ where
}
}

/// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary
/// Creates a new `StringDictionaryBuilder` with the provided capacities
///
/// `keys_capacity`: the number of keys, i.e. length of array to build
/// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
/// `string_capacity`: the total number of bytes of all distinct strings in the dictionary
pub fn with_capacity(
keys_capacity: usize,
value_capacity: usize,
string_capacity: usize,
) -> Self {
Self {
state: Default::default(),
dedup: Default::default(),
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder: StringBuilder::with_capacity(value_capacity, string_capacity),
}
}

/// Creates a new `StringDictionaryBuilder` from a keys capacity and a dictionary
/// which is initialized with the given values.
/// The indices of those dictionary values are used as keys.
///
Expand All @@ -111,7 +138,7 @@ where
///
/// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
///
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::<Int16Type>::with_capacity(3), &dictionary_values).unwrap();
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
/// builder.append("def").unwrap();
/// builder.append_null();
/// builder.append("abc").unwrap();
Expand All @@ -123,7 +150,7 @@ where
/// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
/// ```
pub fn new_with_dictionary(
keys_builder: PrimitiveBuilder<K>,
keys_capacity: usize,
dictionary_values: &StringArray,
) -> Result<Self> {
let state = ahash::RandomState::default();
Expand Down Expand Up @@ -162,7 +189,7 @@ where
Ok(Self {
state,
dedup,
keys_builder,
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder,
})
}
Expand Down Expand Up @@ -290,9 +317,7 @@ mod tests {

#[test]
fn test_string_dictionary_builder() {
let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(5);
let value_builder = StringBuilder::new();
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
let mut builder = StringDictionaryBuilder::<Int8Type>::new();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
Expand All @@ -317,10 +342,8 @@ mod tests {
fn test_string_dictionary_builder_with_existing_dictionary() {
let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]);

let key_builder = PrimitiveBuilder::<Int8Type>::with_capacity(6);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
.unwrap();
StringDictionaryBuilder::new_with_dictionary(6, &dictionary).unwrap();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
Expand Down Expand Up @@ -349,9 +372,8 @@ mod tests {
let dictionary: Vec<Option<&str>> = vec![None];
let dictionary = StringArray::from(dictionary);

let key_builder = PrimitiveBuilder::<Int16Type>::with_capacity(4);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
StringDictionaryBuilder::<Int16Type>::new_with_dictionary(4, &dictionary)
.unwrap();
builder.append("abc").unwrap();
builder.append_null();
Expand Down
8 changes: 4 additions & 4 deletions arrow/src/array/equal/mod.rs
Expand Up @@ -265,8 +265,8 @@ mod tests {
use crate::array::{
array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray,
FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder,
ListBuilder, NullArray, PrimitiveBuilder, StringArray, StringDictionaryBuilder,
StructArray, UnionBuilder,
ListBuilder, NullArray, StringArray, StringDictionaryBuilder, StructArray,
UnionBuilder,
};
use crate::array::{GenericStringArray, Int32Array};
use crate::buffer::Buffer;
Expand Down Expand Up @@ -1245,8 +1245,8 @@ mod tests {

fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData {
let values = StringArray::from(values.to_vec());
let mut builder = StringDictionaryBuilder::new_with_dictionary(
PrimitiveBuilder::<Int16Type>::with_capacity(3),
let mut builder = StringDictionaryBuilder::<Int16Type>::new_with_dictionary(
keys.len(),
&values,
)
.unwrap();
Expand Down
8 changes: 4 additions & 4 deletions arrow/src/array/transform/mod.rs
Expand Up @@ -675,8 +675,8 @@ mod tests {
array::{
Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray,
FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array,
Int64Builder, ListBuilder, MapBuilder, NullArray, PrimitiveBuilder,
StringArray, StringDictionaryBuilder, StructArray, UInt8Array,
Int64Builder, ListBuilder, MapBuilder, NullArray, StringArray,
StringDictionaryBuilder, StructArray, UInt8Array,
},
buffer::Buffer,
datatypes::Field,
Expand Down Expand Up @@ -963,8 +963,8 @@ mod tests {

fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData {
let values = StringArray::from(values.to_vec());
let mut builder = StringDictionaryBuilder::new_with_dictionary(
PrimitiveBuilder::<Int16Type>::with_capacity(3),
let mut builder = StringDictionaryBuilder::<Int16Type>::new_with_dictionary(
keys.len(),
&values,
)
.unwrap();
Expand Down