Skip to content

Commit

Permalink
Support nullable indices in boolean take kernel and some optimizations (
Browse files Browse the repository at this point in the history
apache#2064)

* Use iterator in boolean take kernel and support nullable indices

* Improve performance of take_boolean kernel by processing validity and value bits separately

* Add test for take_boolean with masked out of bounds indices

* Test and fix for when only the indices contain null values
  • Loading branch information
jhorstmann committed Jul 16, 2022
1 parent 72dada6 commit a6379fe
Showing 1 changed file with 86 additions and 48 deletions.
134 changes: 86 additions & 48 deletions arrow/src/compute/kernels/take.rs
Expand Up @@ -599,76 +599,68 @@ where
Ok(PrimitiveArray::<T>::from(data))
}

/// `take` implementation for boolean arrays
fn take_boolean<IndexType>(
values: &BooleanArray,
fn take_bits<IndexType>(
values: &Buffer,
values_offset: usize,
indices: &PrimitiveArray<IndexType>,
) -> Result<BooleanArray>
) -> Result<Buffer>
where
IndexType: ArrowNumericType,
IndexType::Native: ToPrimitive,
{
let data_len = indices.len();
let len = indices.len();
let values_slice = values.as_slice();
let mut output_buffer = MutableBuffer::new_null(len);
let output_slice = output_buffer.as_slice_mut();

let num_byte = bit_util::ceil(data_len, 8);
let mut val_buf = MutableBuffer::from_len_zeroed(num_byte);

let val_slice = val_buf.as_slice_mut();

let null_count = values.null_count();

let nulls = if null_count == 0 {
(0..data_len).try_for_each::<_, Result<()>>(|i| {
let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
ArrowError::ComputeError("Cast to usize failed".to_string())
})?;
indices
.iter()
.enumerate()
.try_for_each::<_, Result<()>>(|(i, index)| {
if let Some(index) = index {
let index = ToPrimitive::to_usize(&index).ok_or_else(|| {
ArrowError::ComputeError("Cast to usize failed".to_string())
})?;

if values.value(index) {
bit_util::set_bit(val_slice, i);
if bit_util::get_bit(values_slice, values_offset + index) {
bit_util::set_bit(output_slice, i);
}
}

Ok(())
})?;

indices.data_ref().null_buffer().cloned()
} else {
let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, true);
let null_slice = null_buf.as_slice_mut();

(0..data_len).try_for_each::<_, Result<()>>(|i| {
let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
ArrowError::ComputeError("Cast to usize failed".to_string())
})?;

if values.is_null(index) {
bit_util::unset_bit(null_slice, i);
} else if values.value(index) {
bit_util::set_bit(val_slice, i);
}

Ok(())
})?;
Ok(output_buffer.into())
}

match indices.data_ref().null_buffer() {
Some(buffer) => Some(buffer_bin_and(
buffer,
indices.offset(),
&null_buf.into(),
0,
indices.len(),
)),
None => Some(null_buf.into()),
/// `take` implementation for boolean arrays
fn take_boolean<IndexType>(
values: &BooleanArray,
indices: &PrimitiveArray<IndexType>,
) -> Result<BooleanArray>
where
IndexType: ArrowNumericType,
IndexType::Native: ToPrimitive,
{
let val_buf = take_bits(values.values(), values.offset(), indices)?;
let null_buf = match values.data().null_buffer() {
Some(buf) if values.null_count() > 0 => {
Some(take_bits(buf, values.offset(), indices)?)
}
_ => indices
.data()
.null_buffer()
.map(|b| b.bit_slice(indices.offset(), indices.len())),
};

let data = unsafe {
ArrayData::new_unchecked(
DataType::Boolean,
indices.len(),
None,
nulls,
null_buf,
0,
vec![val_buf.into()],
vec![val_buf],
vec![],
)
};
Expand Down Expand Up @@ -1467,6 +1459,52 @@ mod tests {
);
}

#[test]
fn test_take_bool_nullable_index() {
// indices where the masked invalid elements would be out of bounds
let index_data = ArrayData::try_new(
DataType::Int32,
6,
Some(Buffer::from_iter(vec![
false, true, false, true, false, true,
])),
0,
vec![Buffer::from_iter(vec![99, 0, 999, 1, 9999, 2])],
vec![],
)
.unwrap();
let index = UInt32Array::from(index_data);
test_take_boolean_arrays(
vec![Some(true), None, Some(false)],
&index,
None,
vec![None, Some(true), None, None, None, Some(false)],
);
}

#[test]
fn test_take_bool_nullable_index_nonnull_values() {
// indices where the masked invalid elements would be out of bounds
let index_data = ArrayData::try_new(
DataType::Int32,
6,
Some(Buffer::from_iter(vec![
false, true, false, true, false, true,
])),
0,
vec![Buffer::from_iter(vec![99, 0, 999, 1, 9999, 2])],
vec![],
)
.unwrap();
let index = UInt32Array::from(index_data);
test_take_boolean_arrays(
vec![Some(true), Some(true), Some(false)],
&index,
None,
vec![None, Some(true), None, Some(true), None, Some(false)],
);
}

#[test]
fn test_take_bool_with_offset() {
let index =
Expand Down

0 comments on commit a6379fe

Please sign in to comment.