From f4ee8b9acbd2ad3110dfc1bf3cb8b93bd876adb5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Oct 2022 06:11:27 +0100 Subject: [PATCH] Handle empty offsets buffer (#1824) (#2836) * Handle empty offsets buffer (#1824) * Review feedback --- arrow-array/src/array/binary_array.rs | 30 +++++++++++++++++-- arrow-array/src/array/list_array.rs | 42 +++++++++++++++++++++++++-- arrow-array/src/array/string_array.rs | 31 ++++++++++++++++++-- 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index cb168daf072..851fb60c078 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -17,7 +17,10 @@ use crate::iterator::GenericBinaryIter; use crate::raw_pointer::RawPtrBox; -use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, OffsetSizeTrait}; +use crate::{ + empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray, + OffsetSizeTrait, +}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -286,7 +289,11 @@ impl From for GenericBinaryArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -845,4 +852,23 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } + + #[test] + fn test_empty_offsets() { + let string = BinaryArray::from( + ArrayData::builder(DataType::Binary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeBinaryArray::from( + ArrayData::builder(DataType::LargeBinary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index b45a0f9257f..3022db023ab 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -43,6 +43,17 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } +/// Returns a slice of `OffsetSize` consisting of a single zero value +#[inline] +pub(crate) fn empty_offsets() -> &'static [OffsetSize] { + static OFFSET: &[i64] = &[0]; + // SAFETY: + // OffsetSize is ArrowNativeType and is therefore trivially transmutable + let (prefix, val, suffix) = unsafe { OFFSET.align_to::() }; + assert!(prefix.is_empty() && suffix.is_empty()); + val +} + /// Generic struct for a variable-size list array. /// /// Columnar format in Apache Arrow: @@ -240,8 +251,13 @@ impl GenericListArray { } let values = make_array(values); - let value_offsets = data.buffers()[0].as_ptr(); - let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; + // Handle case of empty offsets + let offsets = match data.is_empty() && data.buffers()[0].is_empty() { + true => empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; + + let value_offsets = unsafe { RawPtrBox::new(offsets) }; Ok(Self { data, values, @@ -941,4 +957,26 @@ mod tests { false, ); } + + #[test] + fn test_empty_offsets() { + let f = Box::new(Field::new("element", DataType::Int32, true)); + let string = ListArray::from( + ArrayData::builder(DataType::List(f.clone())) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeListArray::from( + ArrayData::builder(DataType::LargeList(f)) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 22ad81eaa3f..7e2ed3667e2 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -18,8 +18,8 @@ use crate::iterator::GenericStringIter; use crate::raw_pointer::RawPtrBox; use crate::{ - print_long_array, Array, ArrayAccessor, GenericBinaryArray, GenericListArray, - OffsetSizeTrait, + empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, + GenericListArray, OffsetSizeTrait, }; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -370,7 +370,11 @@ impl From for GenericStringArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -823,4 +827,25 @@ mod tests { fn test_large_string_array_from_list_array_wrong_type() { _test_generic_string_array_from_list_array_wrong_type::(); } + + #[test] + fn test_empty_offsets() { + let string = StringArray::from( + ArrayData::builder(DataType::Utf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + + let string = LargeStringArray::from( + ArrayData::builder(DataType::LargeUtf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } }