Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FixedSizeBinaryArray::try_from_sparse_iter_with_size #3054

Merged
merged 12 commits into from Nov 13, 2022
Merged
121 changes: 114 additions & 7 deletions arrow-array/src/array/fixed_size_binary_array.rs
Expand Up @@ -129,6 +129,9 @@ impl FixedSizeBinaryArray {
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
#[deprecated(
note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`"
)]
pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
Expand Down Expand Up @@ -196,6 +199,86 @@ impl FixedSizeBinaryArray {
Ok(FixedSizeBinaryArray::from(array_data))
}

/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that items returned by the iterator are optional, i.e input argument can
/// contain `None` items. In cases where the iterator returns only `None` values, this
/// also takes a size parameter to ensure that the a valid FixedSizeBinaryArray is still
/// created.
///
/// # Examples
///
/// ```
/// use arrow_array::FixedSizeBinaryArray;
/// let input_arg = vec![
/// None,
/// Some(vec![7, 8]),
/// Some(vec![9, 10]),
/// None,
/// Some(vec![13, 14]),
/// None,
/// ];
/// let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap();
/// ```
///
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
pub fn try_from_sparse_iter_with_size<T, U>(
mut iter: T,
size: i32,
) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
U: AsRef<[u8]>,
{
let mut len = 0;
let mut byte = 0;
let mut null_buf = MutableBuffer::from_len_zeroed(0);
let mut buffer = MutableBuffer::from_len_zeroed(0);

iter.try_for_each(|item| -> Result<(), ArrowError> {
// extend null bitmask by one byte per each 8 items
if byte == 0 {
null_buf.push(0u8);
byte = 8;
}
byte -= 1;

if let Some(slice) = item {
let slice = slice.as_ref();
if size as usize != slice.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Nested array size mismatch: one is {}, and the other is {}",
size,
slice.len()
)));
}

bit_util::set_bit(null_buf.as_slice_mut(), len);
buffer.extend_from_slice(slice);
} else {
buffer.extend_zeros(size as usize);
}

len += 1;

Ok(())
})?;

let array_data = unsafe {
ArrayData::new_unchecked(
DataType::FixedSizeBinary(size),
len,
None,
Some(null_buf.into()),
0,
vec![buffer.into()],
vec![],
)
};
Ok(FixedSizeBinaryArray::from(array_data))
}

/// Create an array from an iterable argument of byte slices.
///
/// # Examples
Expand Down Expand Up @@ -333,6 +416,7 @@ impl From<FixedSizeListArray> for FixedSizeBinaryArray {

impl From<Vec<Option<&[u8]>>> for FixedSizeBinaryArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
#[allow(deprecated)]
Self::try_from_sparse_iter(v.into_iter()).unwrap()
}
}
Expand Down Expand Up @@ -561,6 +645,7 @@ mod tests {
fn test_all_none_fixed_size_binary_array_from_sparse_iter() {
let none_option: Option<[u8; 32]> = None;
let input_arg = vec![none_option, none_option, none_option];
#[allow(deprecated)]
let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
assert_eq!(0, arr.value_length());
Expand All @@ -576,9 +661,31 @@ mod tests {
None,
Some(vec![13, 14]),
];
let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
#[allow(deprecated)]
let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned())
.unwrap();
assert_eq!(2, arr.value_length());
assert_eq!(5, arr.len());

let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
input_arg.into_iter(),
2,
)
.unwrap();
assert_eq!(2, arr.value_length());
assert_eq!(5, arr.len());
}

#[test]
fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() {
let input_arg = vec![None, None, None, None, None] as Vec<Option<Vec<u8>>>;

let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
input_arg.into_iter(),
16,
)
.unwrap();
assert_eq!(16, arr.value_length());
assert_eq!(5, arr.len())
}

Expand Down Expand Up @@ -643,7 +750,9 @@ mod tests {
#[test]
fn fixed_size_binary_array_all_null() {
let data = vec![None] as Vec<Option<String>>;
let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap();
let array =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0)
.unwrap();
array
.data()
.validate_full()
Expand All @@ -652,16 +761,14 @@ mod tests {

#[test]
// Test for https://github.com/apache/arrow-rs/issues/1390
#[should_panic(
expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0"
)]
fn fixed_size_binary_array_all_null_in_batch_with_schema() {
let schema =
Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]);

let none_option: Option<[u8; 2]> = None;
let item = FixedSizeBinaryArray::try_from_sparse_iter(
let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![none_option, none_option, none_option].into_iter(),
2,
)
.unwrap();

Expand Down
7 changes: 4 additions & 3 deletions arrow-select/src/take.rs
Expand Up @@ -207,12 +207,12 @@ where
DataType::LargeBinary => {
Ok(Arc::new(take_bytes(as_generic_binary_array::<i64>(values), indices)?))
}
DataType::FixedSizeBinary(_) => {
DataType::FixedSizeBinary(size) => {
let values = values
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
Ok(Arc::new(take_fixed_size_binary(values, indices)?))
Ok(Arc::new(take_fixed_size_binary(values, indices, *size)?))
}
DataType::Null => {
// Take applied to a null array produces a null array.
Expand Down Expand Up @@ -769,6 +769,7 @@ where
fn take_fixed_size_binary<IndexType>(
values: &FixedSizeBinaryArray,
indices: &PrimitiveArray<IndexType>,
size: i32,
) -> Result<FixedSizeBinaryArray, ArrowError>
where
IndexType: ArrowPrimitiveType,
Expand All @@ -789,7 +790,7 @@ where
.collect::<Result<Vec<_>, ArrowError>>()?
.into_iter();

FixedSizeBinaryArray::try_from_sparse_iter(array_iter)
FixedSizeBinaryArray::try_from_sparse_iter_with_size(array_iter, size)
}

/// `take` implementation for dictionary arrays
Expand Down
41 changes: 41 additions & 0 deletions arrow/src/compute/kernels/comparison.rs
Expand Up @@ -2270,6 +2270,18 @@ macro_rules! typed_compares {
as_largestring_array($RIGHT),
$OP,
),
(DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change does not appear to be related?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's not, but it was something we were hoping to upstream and I think it accidentally fell into the branch. Would you like me to update this PR to mention this change?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think either we need tests of it, or we need to roll it back

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests added.

let lhs = $LEFT
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let rhs = $RIGHT
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();

compare_op(lhs, rhs, $OP)
}
(DataType::Binary, DataType::Binary) => compare_op(
as_generic_binary_array::<i32>($LEFT),
as_generic_binary_array::<i32>($RIGHT),
Expand Down Expand Up @@ -5461,6 +5473,35 @@ mod tests {
);
}

#[test]
fn test_eq_dyn_neq_dyn_fixed_size_binary() {
use crate::array::FixedSizeBinaryArray;

let values1: Vec<Option<&[u8]>> =
vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])];
let values2: Vec<Option<&[u8]>> =
vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])];

let array1 =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2)
.unwrap();
let array2 =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2)
.unwrap();

let result = eq_dyn(&array1, &array2).unwrap();
assert_eq!(
BooleanArray::from(vec![Some(true), None, Some(false)]),
result
);

let result = neq_dyn(&array1, &array2).unwrap();
assert_eq!(
BooleanArray::from(vec![Some(false), None, Some(true)]),
result
);
}

#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dictionary_i8_array() {
Expand Down
15 changes: 11 additions & 4 deletions arrow/src/compute/kernels/sort.rs
Expand Up @@ -1437,17 +1437,24 @@ mod tests {
fixed_length: Option<i32>,
) {
// Fixed size binary array
if fixed_length.is_some() {
if let Some(length) = fixed_length {
let input = Arc::new(
FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(),
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
data.iter().cloned(),
length,
)
.unwrap(),
);
let sorted = match limit {
Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(),
None => sort(&(input as ArrayRef), options).unwrap(),
};
let expected = Arc::new(
FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned())
.unwrap(),
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
expected_data.iter().cloned(),
length,
)
.unwrap(),
) as ArrayRef;

assert_eq!(&sorted, &expected);
Expand Down
3 changes: 2 additions & 1 deletion arrow/src/compute/kernels/substring.rs
Expand Up @@ -713,8 +713,9 @@ mod tests {
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let expected = FixedSizeBinaryArray::try_from_sparse_iter(
let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![None, Some(b"rrow")].into_iter(),
4,
)
.unwrap();
assert_eq!(result, &expected);
Expand Down
25 changes: 14 additions & 11 deletions arrow/src/util/bench_util.rs
Expand Up @@ -176,17 +176,20 @@ pub fn create_fsb_array(
) -> FixedSizeBinaryArray {
let rng = &mut seedable_rng();

FixedSizeBinaryArray::try_from_sparse_iter((0..size).map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
.sample_iter::<u8, _>(Standard)
.take(value_len)
.collect::<Vec<u8>>();
Some(value)
}
}))
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
(0..size).map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
.sample_iter::<u8, _>(Standard)
.take(value_len)
.collect::<Vec<u8>>();
Some(value)
}
}),
value_len as i32,
)
.unwrap()
}

Expand Down
9 changes: 5 additions & 4 deletions arrow/tests/array_transform.rs
Expand Up @@ -868,7 +868,7 @@ fn test_list_of_strings_append() {
#[test]
fn test_fixed_size_binary_append() {
let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])];
let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter())
let a = FixedSizeBinaryArray::try_from_sparse_iter_with_size(a.into_iter(), 2)
.expect("Failed to create FixedSizeBinaryArray from iterable");

let b = vec![
Expand All @@ -879,7 +879,7 @@ fn test_fixed_size_binary_append() {
Some(vec![13, 14]),
None,
];
let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter())
let b = FixedSizeBinaryArray::try_from_sparse_iter_with_size(b.into_iter(), 2)
.expect("Failed to create FixedSizeBinaryArray from iterable");

let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10);
Expand Down Expand Up @@ -911,8 +911,9 @@ fn test_fixed_size_binary_append() {
Some(vec![9, 10]),
// b[4..4]
];
let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter())
.expect("Failed to create FixedSizeBinaryArray from iterable");
let expected =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2)
.expect("Failed to create FixedSizeBinaryArray from iterable");
assert_eq!(&result, expected.data());
}

Expand Down