Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more const evaluation for GenericBinaryArray and GenericListArray: add PREFIX and data type constructor #2327

Merged
merged 1 commit into from Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 10 additions & 16 deletions arrow/src/array/array_binary.rs
Expand Up @@ -236,7 +236,7 @@ impl<'a, T: OffsetSizeTrait> GenericBinaryArray<T> {

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericBinaryArray<OffsetSize> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" };
let prefix = OffsetSize::PREFIX;

write!(f, "{}BinaryArray\n[\n", prefix)?;
print_long_array(self, f, |array, index, f| {
Expand Down Expand Up @@ -608,11 +608,9 @@ mod tests {
.unwrap();
let binary_array1 = GenericBinaryArray::<O>::from(array_data1);

let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt8, false),
));

let array_data2 = ArrayData::builder(data_type)
.len(3)
Expand Down Expand Up @@ -660,11 +658,9 @@ mod tests {

let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
let null_buffer = Buffer::from_slice_ref(&[0b101]);
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt8, false),
));

// [None, Some(b"Parquet")]
let array_data = ArrayData::builder(data_type)
Expand Down Expand Up @@ -707,11 +703,9 @@ mod tests {
.unwrap();

let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt8, false),
));

// [None, Some(b"Parquet")]
let array_data = ArrayData::builder(data_type)
Expand Down
21 changes: 15 additions & 6 deletions arrow/src/array/array_list.rs
Expand Up @@ -34,14 +34,17 @@ use crate::{
/// trait declaring an offset size, relevant for i32 vs i64 array types.
pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer {
const IS_LARGE: bool;
const PREFIX: &'static str;
}

impl OffsetSizeTrait for i32 {
const IS_LARGE: bool = false;
const PREFIX: &'static str = "";
}

impl OffsetSizeTrait for i64 {
const IS_LARGE: bool = true;
const PREFIX: &'static str = "Large";
}

/// Generic struct for a variable-size list array.
Expand All @@ -57,6 +60,16 @@ pub struct GenericListArray<OffsetSize> {
}

impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// The data type constructor of list array.
/// The input is the schema of the child array and
/// the output is the [`DataType`], List or LargeList.
pub const DATA_TYPE_CONSTRUCTOR: fn(Box<Field>) -> DataType = if OffsetSize::IS_LARGE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
pub const DATA_TYPE_CONSTRUCTOR: fn(Box<Field>) -> DataType = if OffsetSize::IS_LARGE
pub const MAKE_DATA_TYPE: fn(Box<Field>) -> DataType = if OffsetSize::IS_LARGE

DATA_TYPE_CONSTRUCTOR was somewhat hard on my eyes when reading this PR. That being said while I might prefer something like MAKE_DATA_TYPE it is only an opinion for your consideration

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your review @alamb. Personally, I prefer the name CONSTRUCTOR, as ListArray and LargeListArray constructor tags of DataType: https://en.wikipedia.org/wiki/Algebraic_data_type#:~:text=A%20general%20algebraic%20data%20type,factors%20of%20the%20product%20type.

{
DataType::LargeList
} else {
DataType::List
};

/// Returns a reference to the values of this list.
pub fn values(&self) -> ArrayRef {
self.values.clone()
Expand Down Expand Up @@ -170,11 +183,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
.collect();

let field = Box::new(Field::new("item", T::DATA_TYPE, true));
let data_type = if OffsetSize::IS_LARGE {
DataType::LargeList(field)
} else {
DataType::List(field)
};
let data_type = Self::DATA_TYPE_CONSTRUCTOR(field);
let array_data = ArrayData::builder(data_type)
.len(null_buf.len())
.add_buffer(offsets.into())
Expand Down Expand Up @@ -274,7 +283,7 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray<Off

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" };
let prefix = OffsetSize::PREFIX;

write!(f, "{}ListArray\n[\n", prefix)?;
print_long_array(self, f, |array, index, f| {
Expand Down
26 changes: 10 additions & 16 deletions arrow/src/array/array_string.rs
Expand Up @@ -294,7 +294,7 @@ impl<'a, T: OffsetSizeTrait> GenericStringArray<T> {

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" };
let prefix = OffsetSize::PREFIX;

write!(f, "{}StringArray\n[\n", prefix)?;
print_long_array(self, f, |array, index, f| {
Expand Down Expand Up @@ -707,11 +707,9 @@ mod tests {

let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
let null_buffer = Buffer::from_slice_ref(&[0b101]);
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt8, false),
));

// [None, Some("Parquet")]
let array_data = ArrayData::builder(data_type)
Expand Down Expand Up @@ -754,11 +752,9 @@ mod tests {
.unwrap();

let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt8, false),
));

// [None, Some(b"Parquet")]
let array_data = ArrayData::builder(data_type)
Expand Down Expand Up @@ -792,11 +788,9 @@ mod tests {
.unwrap();

let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt16, false)));
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::UInt16, false),
));

let array_data = ArrayData::builder(data_type)
.len(2)
Expand Down
8 changes: 2 additions & 6 deletions arrow/src/array/builder/generic_list_builder.rs
Expand Up @@ -22,7 +22,6 @@ use crate::array::ArrayData;
use crate::array::ArrayRef;
use crate::array::GenericListArray;
use crate::array::OffsetSizeTrait;
use crate::datatypes::DataType;
use crate::datatypes::Field;

use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
Expand Down Expand Up @@ -135,11 +134,7 @@ where
values_data.data_type().clone(),
true, // TODO: find a consistent way of getting this
));
let data_type = if OffsetSize::IS_LARGE {
DataType::LargeList(field)
} else {
DataType::List(field)
};
let data_type = GenericListArray::<OffsetSize>::DATA_TYPE_CONSTRUCTOR(field);
let array_data_builder = ArrayData::builder(data_type)
.len(len)
.add_buffer(offset_buffer)
Expand All @@ -163,6 +158,7 @@ mod tests {
use crate::array::builder::ListBuilder;
use crate::array::{Array, Int32Array, Int32Builder};
use crate::buffer::Buffer;
use crate::datatypes::DataType;

fn _test_generic_list_array_builder<O: OffsetSizeTrait>() {
let values_builder = Int32Builder::new(10);
Expand Down
6 changes: 1 addition & 5 deletions arrow/src/array/transform/mod.rs
Expand Up @@ -313,11 +313,7 @@ fn preallocate_offset_and_binary_buffer<Offset: OffsetSizeTrait>(
// offsets
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
if Offset::IS_LARGE {
buffer.push(0i64);
} else {
buffer.push(0i32)
}
buffer.push(Offset::zero());

[
buffer,
Expand Down
38 changes: 10 additions & 28 deletions arrow/src/compute/util.rs
Expand Up @@ -351,9 +351,7 @@ pub(super) mod tests {
T: ArrowPrimitiveType,
PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
{
use std::any::TypeId;

let mut offset = vec![0];
let mut offset = vec![S::zero()];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

let mut values = vec![];

let list_len = data.len();
Expand All @@ -367,34 +365,18 @@ pub(super) mod tests {
list_null_count += 1;
bit_util::unset_bit(list_bitmap.as_slice_mut(), idx);
}
offset.push(values.len() as i64);
offset.push(S::from_usize(values.len()).unwrap());
}

let value_data = PrimitiveArray::<T>::from(values).into_data();
let (list_data_type, value_offsets) = if TypeId::of::<S>() == TypeId::of::<i32>()
{
(
DataType::List(Box::new(Field::new(
"item",
T::DATA_TYPE,
list_null_count == 0,
))),
Buffer::from_slice_ref(
&offset.into_iter().map(|x| x as i32).collect::<Vec<i32>>(),
),
)
} else if TypeId::of::<S>() == TypeId::of::<i64>() {
(
DataType::LargeList(Box::new(Field::new(
"item",
T::DATA_TYPE,
list_null_count == 0,
))),
Buffer::from_slice_ref(&offset),
)
} else {
unreachable!()
};
let (list_data_type, value_offsets) = (
GenericListArray::<S>::DATA_TYPE_CONSTRUCTOR(Box::new(Field::new(
"item",
T::DATA_TYPE,
list_null_count == 0,
))),
Buffer::from_slice_ref(&offset),
);

let list_data = ArrayData::builder(list_data_type)
.len(list_len)
Expand Down
9 changes: 3 additions & 6 deletions arrow/src/ffi.rs
Expand Up @@ -1034,12 +1034,9 @@ mod tests {
.collect::<Buffer>();

// Construct a list array from the above two
let list_data_type = match std::mem::size_of::<Offset>() {
4 => DataType::List(Box::new(Field::new("item", DataType::Int32, false))),
_ => {
DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false)))
}
};
let list_data_type = GenericListArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Box::new(
Field::new("item", DataType::Int32, false),
));

let list_data = ArrayData::builder(list_data_type)
.len(3)
Expand Down
5 changes: 1 addition & 4 deletions parquet/src/arrow/array_reader/list_array.rs
Expand Up @@ -268,10 +268,7 @@ mod tests {
item_nullable: bool,
) -> ArrowType {
let field = Box::new(Field::new("item", data_type, item_nullable));
match OffsetSize::IS_LARGE {
true => ArrowType::LargeList(field),
false => ArrowType::List(field),
}
GenericListArray::<OffsetSize>::DATA_TYPE_CONSTRUCTOR(field)
}

fn downcast<OffsetSize: OffsetSizeTrait>(
Expand Down