Skip to content

Commit

Permalink
Add FixedSizeBinaryArray::try_from_sparse_iter_with_size (#3054)
Browse files Browse the repository at this point in the history
* implement ord for FixedSizeBinary types

* add ord test

* add compare for fixed size binary arrays

* FixedSizeBinaryArray::try_from_sparse_iter has fails to generate a valid array when the iterator only produces None values

* Tweak try_from_sparse_iter_with_size to take an Option<i32>; pass tests

* simplify try_from_sparse_iter_with_size, make size parameter non-optional

* add test for fixed size binary comparisons

* move tests to use FixedSizeBinaryArray::from_sparse_iter_with_size, add docstring

* format + fix failing tests

* fix build
  • Loading branch information
maxburke committed Nov 13, 2022
1 parent b7af85c commit 20d81f5
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 34 deletions.
121 changes: 114 additions & 7 deletions arrow-array/src/array/fixed_size_binary_array.rs
Expand Up @@ -129,6 +129,9 @@ impl FixedSizeBinaryArray {
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
#[deprecated(
note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`"
)]
pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
Expand Down Expand Up @@ -196,6 +199,86 @@ impl FixedSizeBinaryArray {
Ok(FixedSizeBinaryArray::from(array_data))
}

/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that items returned by the iterator are optional, i.e input argument can
/// contain `None` items. In cases where the iterator returns only `None` values, this
/// also takes a size parameter to ensure that the a valid FixedSizeBinaryArray is still
/// created.
///
/// # Examples
///
/// ```
/// use arrow_array::FixedSizeBinaryArray;
/// let input_arg = vec![
/// None,
/// Some(vec![7, 8]),
/// Some(vec![9, 10]),
/// None,
/// Some(vec![13, 14]),
/// None,
/// ];
/// let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap();
/// ```
///
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
pub fn try_from_sparse_iter_with_size<T, U>(
mut iter: T,
size: i32,
) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
U: AsRef<[u8]>,
{
let mut len = 0;
let mut byte = 0;
let mut null_buf = MutableBuffer::from_len_zeroed(0);
let mut buffer = MutableBuffer::from_len_zeroed(0);

iter.try_for_each(|item| -> Result<(), ArrowError> {
// extend null bitmask by one byte per each 8 items
if byte == 0 {
null_buf.push(0u8);
byte = 8;
}
byte -= 1;

if let Some(slice) = item {
let slice = slice.as_ref();
if size as usize != slice.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Nested array size mismatch: one is {}, and the other is {}",
size,
slice.len()
)));
}

bit_util::set_bit(null_buf.as_slice_mut(), len);
buffer.extend_from_slice(slice);
} else {
buffer.extend_zeros(size as usize);
}

len += 1;

Ok(())
})?;

let array_data = unsafe {
ArrayData::new_unchecked(
DataType::FixedSizeBinary(size),
len,
None,
Some(null_buf.into()),
0,
vec![buffer.into()],
vec![],
)
};
Ok(FixedSizeBinaryArray::from(array_data))
}

/// Create an array from an iterable argument of byte slices.
///
/// # Examples
Expand Down Expand Up @@ -333,6 +416,7 @@ impl From<FixedSizeListArray> for FixedSizeBinaryArray {

impl From<Vec<Option<&[u8]>>> for FixedSizeBinaryArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
#[allow(deprecated)]
Self::try_from_sparse_iter(v.into_iter()).unwrap()
}
}
Expand Down Expand Up @@ -561,6 +645,7 @@ mod tests {
fn test_all_none_fixed_size_binary_array_from_sparse_iter() {
let none_option: Option<[u8; 32]> = None;
let input_arg = vec![none_option, none_option, none_option];
#[allow(deprecated)]
let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
assert_eq!(0, arr.value_length());
Expand All @@ -576,9 +661,31 @@ mod tests {
None,
Some(vec![13, 14]),
];
let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
#[allow(deprecated)]
let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned())
.unwrap();
assert_eq!(2, arr.value_length());
assert_eq!(5, arr.len());

let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
input_arg.into_iter(),
2,
)
.unwrap();
assert_eq!(2, arr.value_length());
assert_eq!(5, arr.len());
}

#[test]
fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() {
let input_arg = vec![None, None, None, None, None] as Vec<Option<Vec<u8>>>;

let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
input_arg.into_iter(),
16,
)
.unwrap();
assert_eq!(16, arr.value_length());
assert_eq!(5, arr.len())
}

Expand Down Expand Up @@ -643,7 +750,9 @@ mod tests {
#[test]
fn fixed_size_binary_array_all_null() {
let data = vec![None] as Vec<Option<String>>;
let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap();
let array =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0)
.unwrap();
array
.data()
.validate_full()
Expand All @@ -652,16 +761,14 @@ mod tests {

#[test]
// Test for https://github.com/apache/arrow-rs/issues/1390
#[should_panic(
expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0"
)]
fn fixed_size_binary_array_all_null_in_batch_with_schema() {
let schema =
Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]);

let none_option: Option<[u8; 2]> = None;
let item = FixedSizeBinaryArray::try_from_sparse_iter(
let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![none_option, none_option, none_option].into_iter(),
2,
)
.unwrap();

Expand Down
7 changes: 4 additions & 3 deletions arrow-select/src/take.rs
Expand Up @@ -207,12 +207,12 @@ where
DataType::LargeBinary => {
Ok(Arc::new(take_bytes(as_generic_binary_array::<i64>(values), indices)?))
}
DataType::FixedSizeBinary(_) => {
DataType::FixedSizeBinary(size) => {
let values = values
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
Ok(Arc::new(take_fixed_size_binary(values, indices)?))
Ok(Arc::new(take_fixed_size_binary(values, indices, *size)?))
}
DataType::Null => {
// Take applied to a null array produces a null array.
Expand Down Expand Up @@ -769,6 +769,7 @@ where
fn take_fixed_size_binary<IndexType>(
values: &FixedSizeBinaryArray,
indices: &PrimitiveArray<IndexType>,
size: i32,
) -> Result<FixedSizeBinaryArray, ArrowError>
where
IndexType: ArrowPrimitiveType,
Expand All @@ -789,7 +790,7 @@ where
.collect::<Result<Vec<_>, ArrowError>>()?
.into_iter();

FixedSizeBinaryArray::try_from_sparse_iter(array_iter)
FixedSizeBinaryArray::try_from_sparse_iter_with_size(array_iter, size)
}

/// `take` implementation for dictionary arrays
Expand Down
3 changes: 2 additions & 1 deletion arrow/src/array/ffi.rs
Expand Up @@ -212,7 +212,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
];
let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
let array =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?;

let data = array.data();
test_round_trip(data)
Expand Down
41 changes: 41 additions & 0 deletions arrow/src/compute/kernels/comparison.rs
Expand Up @@ -2270,6 +2270,18 @@ macro_rules! typed_compares {
as_largestring_array($RIGHT),
$OP,
),
(DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => {
let lhs = $LEFT
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let rhs = $RIGHT
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();

compare_op(lhs, rhs, $OP)
}
(DataType::Binary, DataType::Binary) => compare_op(
as_generic_binary_array::<i32>($LEFT),
as_generic_binary_array::<i32>($RIGHT),
Expand Down Expand Up @@ -5449,6 +5461,35 @@ mod tests {
);
}

#[test]
fn test_eq_dyn_neq_dyn_fixed_size_binary() {
use crate::array::FixedSizeBinaryArray;

let values1: Vec<Option<&[u8]>> =
vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])];
let values2: Vec<Option<&[u8]>> =
vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])];

let array1 =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2)
.unwrap();
let array2 =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2)
.unwrap();

let result = eq_dyn(&array1, &array2).unwrap();
assert_eq!(
BooleanArray::from(vec![Some(true), None, Some(false)]),
result
);

let result = neq_dyn(&array1, &array2).unwrap();
assert_eq!(
BooleanArray::from(vec![Some(false), None, Some(true)]),
result
);
}

#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dictionary_i8_array() {
Expand Down
15 changes: 11 additions & 4 deletions arrow/src/compute/kernels/sort.rs
Expand Up @@ -1437,17 +1437,24 @@ mod tests {
fixed_length: Option<i32>,
) {
// Fixed size binary array
if fixed_length.is_some() {
if let Some(length) = fixed_length {
let input = Arc::new(
FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(),
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
data.iter().cloned(),
length,
)
.unwrap(),
);
let sorted = match limit {
Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(),
None => sort(&(input as ArrayRef), options).unwrap(),
};
let expected = Arc::new(
FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned())
.unwrap(),
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
expected_data.iter().cloned(),
length,
)
.unwrap(),
) as ArrayRef;

assert_eq!(&sorted, &expected);
Expand Down
3 changes: 2 additions & 1 deletion arrow/src/compute/kernels/substring.rs
Expand Up @@ -713,8 +713,9 @@ mod tests {
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let expected = FixedSizeBinaryArray::try_from_sparse_iter(
let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![None, Some(b"rrow")].into_iter(),
4,
)
.unwrap();
assert_eq!(result, &expected);
Expand Down
8 changes: 5 additions & 3 deletions arrow/src/ffi.rs
Expand Up @@ -1231,7 +1231,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
];
let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
let array =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?;

// export it
let array = ArrowArray::try_from(array.into_data())?;
Expand All @@ -1250,7 +1251,7 @@ mod tests {
// verify
assert_eq!(
array,
&FixedSizeBinaryArray::try_from_sparse_iter(
&FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![
None,
Some(vec![10, 10, 10]),
Expand All @@ -1265,7 +1266,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
]
.into_iter()
.into_iter(),
3
)?
);

Expand Down
25 changes: 14 additions & 11 deletions arrow/src/util/bench_util.rs
Expand Up @@ -176,17 +176,20 @@ pub fn create_fsb_array(
) -> FixedSizeBinaryArray {
let rng = &mut seedable_rng();

FixedSizeBinaryArray::try_from_sparse_iter((0..size).map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
.sample_iter::<u8, _>(Standard)
.take(value_len)
.collect::<Vec<u8>>();
Some(value)
}
}))
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
(0..size).map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
.sample_iter::<u8, _>(Standard)
.take(value_len)
.collect::<Vec<u8>>();
Some(value)
}
}),
value_len as i32,
)
.unwrap()
}

Expand Down

0 comments on commit 20d81f5

Please sign in to comment.