Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement FromIterator for Builders #2038

38 changes: 4 additions & 34 deletions arrow/src/array/array_binary.rs
Expand Up @@ -23,6 +23,7 @@ use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait,
};
use crate::array::builder::GenericBinaryBuilder;
pub use crate::array::DecimalIter;
use crate::buffer::Buffer;
use crate::error::{ArrowError, Result};
Expand Down Expand Up @@ -279,40 +280,9 @@ where
Ptr: AsRef<[u8]>,
{
fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
let iter = iter.into_iter();
let (_, data_len) = iter.size_hint();
let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.

let mut offsets = Vec::with_capacity(data_len + 1);
let mut values = Vec::new();
let mut null_buf = MutableBuffer::new_null(data_len);
let mut length_so_far: OffsetSize = OffsetSize::zero();
offsets.push(length_so_far);

{
let null_slice = null_buf.as_slice_mut();

for (i, s) in iter.enumerate() {
if let Some(s) = s {
let s = s.as_ref();
bit_util::set_bit(null_slice, i);
length_so_far += OffsetSize::from_usize(s.len()).unwrap();
values.extend_from_slice(s);
}
// always add an element in offsets
offsets.push(length_so_far);
}
}

// calculate actual data_len, which may be different from the iterator's upper bound
let data_len = offsets.len() - 1;
let array_data = ArrayData::builder(Self::get_data_type())
.len(data_len)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values))
.null_bit_buffer(Some(null_buf.into()));
let array_data = unsafe { array_data.build_unchecked() };
Self::from(array_data)
iter.into_iter()
.collect::<GenericBinaryBuilder<OffsetSize>>()
.finish()
}
}

Expand Down
33 changes: 1 addition & 32 deletions arrow/src/array/array_boolean.rs
Expand Up @@ -222,38 +222,7 @@ impl<'a> BooleanArray {

impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray {
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
let iter = iter.into_iter();
let (_, data_len) = iter.size_hint();
let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tustvold I noticed some implementations panic if the upper size hint is not available and others pick whatever is available. Is this a deliberate decision? AFAIK we use size hint for slight optimization and it should not be enforced such a way. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be optional for the non-trusted length variants


let num_bytes = bit_util::ceil(data_len, 8);
let mut null_buf = MutableBuffer::from_len_zeroed(num_bytes);
let mut val_buf = MutableBuffer::from_len_zeroed(num_bytes);

let data = val_buf.as_slice_mut();

let null_slice = null_buf.as_slice_mut();
iter.enumerate().for_each(|(i, item)| {
if let Some(a) = item.borrow() {
bit_util::set_bit(null_slice, i);
if *a {
bit_util::set_bit(data, i);
}
}
});

let data = unsafe {
ArrayData::new_unchecked(
DataType::Boolean,
data_len,
None,
Some(null_buf.into()),
0,
vec![val_buf.into()],
vec![],
)
};
BooleanArray::from(data)
iter.into_iter().collect::<BooleanBuilder>().finish()
}
}

Expand Down
35 changes: 3 additions & 32 deletions arrow/src/array/array_decimal.rs
Expand Up @@ -15,15 +15,16 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::DecimalBuilder;
use std::borrow::Borrow;
use std::convert::From;
use std::fmt;
use std::{any::Any, iter::FromIterator};

use super::FixedSizeBinaryArray;
use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray,
};
use super::{BooleanBufferBuilder, FixedSizeBinaryArray};
pub use crate::array::DecimalIter;
use crate::buffer::Buffer;
use crate::datatypes::DataType;
Expand Down Expand Up @@ -374,37 +375,7 @@ impl<'a> DecimalArray {

impl<Ptr: Borrow<Option<i128>>> FromIterator<Ptr> for DecimalArray {
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
let iter = iter.into_iter();
let (lower, upper) = iter.size_hint();
let size_hint = upper.unwrap_or(lower);

let mut null_buf = BooleanBufferBuilder::new(size_hint);

let buffer: Buffer = iter
.map(|item| {
if let Some(a) = item.borrow() {
null_buf.append(true);
*a
} else {
null_buf.append(false);
// arbitrary value for NULL
0
}
})
.collect();

let data = unsafe {
ArrayData::new_unchecked(
Self::default_type(),
null_buf.len(),
None,
Some(null_buf.into()),
0,
vec![buffer],
vec![],
)
};
DecimalArray::from(data)
iter.into_iter().collect::<DecimalBuilder>().finish()
}
}

Expand Down
44 changes: 10 additions & 34 deletions arrow/src/array/array_dictionary.rs
Expand Up @@ -21,8 +21,8 @@ use std::iter::IntoIterator;
use std::{convert::From, iter::FromIterator};

use super::{
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder,
StringArray, StringBuilder, StringDictionaryBuilder,
make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray,
StringDictionaryBuilder,
};
use crate::datatypes::{
ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType,
Expand Down Expand Up @@ -306,26 +306,9 @@ impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<Option<&'a
for DictionaryArray<T>
{
fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::new(lower);
let value_builder = StringBuilder::new(256);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
it.for_each(|i| {
if let Some(i) = i {
// Note: impl ... for Result<DictionaryArray<T>> fails with
// error[E0117]: only traits defined in the current crate can be implemented for arbitrary types
builder
.append(i)
.expect("Unable to append a value to a dictionary array.");
} else {
builder
.append_null()
.expect("Unable to append a null value to a dictionary array.");
}
});

builder.finish()
iter.into_iter()
.collect::<StringDictionaryBuilder<T>>()
.finish()
}
}

Expand All @@ -348,18 +331,10 @@ impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<&'a str>
for DictionaryArray<T>
{
fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let key_builder = PrimitiveBuilder::<T>::new(lower);
let value_builder = StringBuilder::new(256);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
it.for_each(|i| {
builder
.append(i)
.expect("Unable to append a value to a dictionary array.");
});

builder.finish()
iter.into_iter()
.map(Some)
.collect::<StringDictionaryBuilder<T>>()
.finish()
}
}

Expand Down Expand Up @@ -391,6 +366,7 @@ impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
mod tests {
use super::*;

use crate::array::PrimitiveBuilder;
use crate::array::{Float32Array, Int8Array};
use crate::datatypes::{Float32Type, Int16Type};
use crate::{
Expand Down
39 changes: 1 addition & 38 deletions arrow/src/array/array_primitive.rs
Expand Up @@ -404,44 +404,7 @@ impl<T: ArrowPrimitiveType, Ptr: Into<NativeAdapter<T>>> FromIterator<Ptr>
for PrimitiveArray<T>
{
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
let iter = iter.into_iter();
let (lower, _) = iter.size_hint();

let mut null_builder = BooleanBufferBuilder::new(lower);

let buffer: Buffer = iter
.map(|item| {
if let Some(a) = item.into().native {
null_builder.append(true);
a
} else {
null_builder.append(false);
// this ensures that null items on the buffer are not arbitrary.
// This is important because fallible operations can use null values (e.g. a vectorized "add")
// which may panic (e.g. overflow if the number on the slots happen to be very large).
T::Native::default()
}
})
.collect();

let len = null_builder.len();
let null_buf: Buffer = null_builder.into();
let valid_count = null_buf.count_set_bits();
let null_count = len - valid_count;
let opt_null_buf = (null_count != 0).then(|| null_buf);

let data = unsafe {
ArrayData::new_unchecked(
T::DATA_TYPE,
len,
Some(null_count),
opt_null_buf,
0,
vec![buffer],
vec![],
)
};
PrimitiveArray::from(data)
iter.into_iter().collect::<PrimitiveBuilder<T>>().finish()
}
}

Expand Down
44 changes: 7 additions & 37 deletions arrow/src/array/array_string.rs
Expand Up @@ -23,8 +23,8 @@ use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray,
GenericStringIter, OffsetSizeTrait,
};
use crate::array::GenericStringBuilder;
use crate::buffer::Buffer;
use crate::util::bit_util;
use crate::{buffer::MutableBuffer, datatypes::DataType};

/// Generic struct for \[Large\]StringArray
Expand Down Expand Up @@ -207,7 +207,9 @@ where
// Convert each owned Ptr into &str and wrap in an owned `Option`
let iter = iter.into_iter().map(|o| o.as_ref().map(|p| p.as_ref()));
// Build a `GenericStringArray` with the resulting iterator
iter.collect::<GenericStringArray<OffsetSize>>()
iter.into_iter()
.collect::<GenericStringBuilder<OffsetSize>>()
.finish()
}
}

Expand All @@ -218,41 +220,9 @@ where
{
/// Creates a [`GenericStringArray`] based on an iterator of [`Option`]s
fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
let iter = iter.into_iter();
let (_, data_len) = iter.size_hint();
let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.

let offset_size = std::mem::size_of::<OffsetSize>();
let mut offsets = MutableBuffer::new((data_len + 1) * offset_size);
let mut values = MutableBuffer::new(0);
let mut null_buf = MutableBuffer::new_null(data_len);
let null_slice = null_buf.as_slice_mut();
let mut length_so_far = OffsetSize::zero();
offsets.push(length_so_far);

for (i, s) in iter.enumerate() {
let value_bytes = if let Some(ref s) = s {
// set null bit
bit_util::set_bit(null_slice, i);
let s_bytes = s.as_ref().as_bytes();
length_so_far += OffsetSize::from_usize(s_bytes.len()).unwrap();
s_bytes
} else {
b""
};
values.extend_from_slice(value_bytes);
offsets.push(length_so_far);
}

// calculate actual data_len, which may be different from the iterator's upper bound
let data_len = (offsets.len() / offset_size) - 1;
let array_data = ArrayData::builder(Self::get_data_type())
.len(data_len)
.add_buffer(offsets.into())
.add_buffer(values.into())
.null_bit_buffer(Some(null_buf.into()));
let array_data = unsafe { array_data.build_unchecked() };
Self::from(array_data)
iter.into_iter()
.collect::<GenericStringBuilder<OffsetSize>>()
.finish()
}
}

Expand Down