Skip to content

Commit

Permalink
Specialize interleave strings (apache#2864)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Oct 26, 2022
1 parent cf134af commit 8d7bec2
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 30 deletions.
10 changes: 4 additions & 6 deletions arrow-array/src/array/string_array.rs
Expand Up @@ -90,8 +90,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
/// caller is responsible for ensuring that index is within the array bounds
#[inline]
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
let end = self.value_offsets().get_unchecked(i + 1);
let start = self.value_offsets().get_unchecked(i);
let end = self.value_offsets().get_unchecked(i + 1).as_usize();
let start = self.value_offsets().get_unchecked(i).as_usize();

// Soundness
// pointer alignment & location is ensured by RawPtrBox
Expand All @@ -103,10 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
// both of which should cleanly cast to isize on an architecture that supports
// 32/64-bit offsets
let slice = std::slice::from_raw_parts(
self.value_data.as_ptr().offset(start.to_isize().unwrap()),
(*end - *start).to_usize().unwrap(),
);
let slice =
std::slice::from_raw_parts(self.value_data.as_ptr().add(start), end - start);
std::str::from_utf8_unchecked(slice)
}

Expand Down
102 changes: 78 additions & 24 deletions arrow-select/src/interleave.rs
Expand Up @@ -16,11 +16,11 @@
// under the License.

use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::cast::as_primitive_array;
use arrow_array::{
downcast_primitive, make_array, new_empty_array, Array, ArrayRef, ArrowPrimitiveType,
PrimitiveArray,
GenericStringArray, OffsetSizeTrait, PrimitiveArray,
};
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_data::transform::MutableArrayData;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -85,51 +85,105 @@ pub fn interleave(

downcast_primitive! {
data_type => (primitive_helper, values, indices, data_type),
DataType::Utf8 => interleave_string::<i32>(values, indices, data_type),
DataType::LargeUtf8 => interleave_string::<i64>(values, indices, data_type),
_ => interleave_fallback(values, indices)
}
}

/// Common functionality for interleaving arrays
struct Interleave<'a, T> {
arrays: Vec<&'a T>,
null_count: usize,
nulls: Option<Buffer>,
}

impl<'a, T: Array + 'static> Interleave<'a, T> {
fn new(values: &[&'a dyn Array], indices: &'a [(usize, usize)]) -> Self {
let mut has_nulls = false;
let arrays: Vec<&T> = values
.iter()
.map(|x| {
has_nulls = has_nulls || x.null_count() != 0;
x.as_any().downcast_ref().unwrap()
})
.collect();

let mut null_count = 0;
let nulls = has_nulls.then(|| {
let mut builder = BooleanBufferBuilder::new(indices.len());
for (a, b) in indices {
let v = arrays[*a].is_valid(*b);
null_count += !v as usize;
builder.append(v)
}
builder.finish()
});

Self {
arrays,
null_count,
nulls,
}
}
}

fn interleave_primitive<T: ArrowPrimitiveType>(
values: &[&dyn Array],
indices: &[(usize, usize)],
data_type: &DataType,
) -> Result<ArrayRef, ArrowError> {
let mut has_nulls = false;
let cast: Vec<_> = values
.iter()
.map(|x| {
has_nulls = has_nulls || x.null_count() != 0;
as_primitive_array::<T>(*x)
})
.collect();
let interleaved = Interleave::<'_, PrimitiveArray<T>>::new(values, indices);

let mut values = BufferBuilder::<T::Native>::new(indices.len());
for (a, b) in indices {
let v = cast[*a].value(*b);
let v = interleaved.arrays[*a].value(*b);
values.append(v)
}

let mut null_count = 0;
let nulls = has_nulls.then(|| {
let mut builder = BooleanBufferBuilder::new(indices.len());
for (a, b) in indices {
let v = cast[*a].is_valid(*b);
null_count += !v as usize;
builder.append(v)
}
builder.finish()
});

let builder = ArrayDataBuilder::new(data_type.clone())
.len(indices.len())
.add_buffer(values.finish())
.null_bit_buffer(nulls)
.null_count(null_count);
.null_bit_buffer(interleaved.nulls)
.null_count(interleaved.null_count);

let data = unsafe { builder.build_unchecked() };
Ok(Arc::new(PrimitiveArray::<T>::from(data)))
}

fn interleave_string<O: OffsetSizeTrait>(
values: &[&dyn Array],
indices: &[(usize, usize)],
data_type: &DataType,
) -> Result<ArrayRef, ArrowError> {
let interleaved = Interleave::<'_, GenericStringArray<O>>::new(values, indices);

let mut capacity = 0;
let mut offsets = BufferBuilder::<O>::new(indices.len() + 1);
offsets.append(O::from_usize(0).unwrap());
for (a, b) in indices {
let o = interleaved.arrays[*a].value_offsets();
let len = o[*b + 1].as_usize() - o[*b].as_usize();
capacity += len;
offsets.append(O::from_usize(capacity).expect("overflow"));
}

let mut values = MutableBuffer::new(capacity);
for (a, b) in indices {
values.extend_from_slice(interleaved.arrays[*a].value(*b).as_bytes());
}

let builder = ArrayDataBuilder::new(data_type.clone())
.len(indices.len())
.add_buffer(offsets.finish())
.add_buffer(values.into())
.null_bit_buffer(interleaved.nulls)
.null_count(interleaved.null_count);

let data = unsafe { builder.build_unchecked() };
Ok(Arc::new(GenericStringArray::<O>::from(data)))
}

/// Fallback implementation of interleave using [`MutableArrayData`]
fn interleave_fallback(
values: &[&dyn Array],
Expand Down

0 comments on commit 8d7bec2

Please sign in to comment.