From b06996bbd172128a0ade9883686f836b40dc30c6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 22 Jul 2022 10:57:58 -0400 Subject: [PATCH 1/3] Add ArrayAccessor trait (#1948) --- arrow/src/array/array.rs | 73 +++++++ arrow/src/array/array_binary.rs | 15 ++ arrow/src/array/array_boolean.rs | 13 ++ arrow/src/array/array_list.rs | 27 ++- arrow/src/array/array_primitive.rs | 13 ++ arrow/src/array/array_string.rs | 15 ++ arrow/src/array/equal_json.rs | 10 + arrow/src/array/iterator.rs | 331 ++--------------------------- 8 files changed, 186 insertions(+), 311 deletions(-) diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 3d8fdd70b30..3e1d443b0d0 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -297,6 +297,79 @@ impl Array for ArrayRef { } } +impl<'a, T: Array> Array for &'a T { + fn as_any(&self) -> &dyn Any { + T::as_any(self) + } + + fn data(&self) -> &ArrayData { + T::data(self) + } + + fn into_data(self) -> ArrayData { + self.data().clone() + } + + fn data_ref(&self) -> &ArrayData { + T::data_ref(self) + } + + fn data_type(&self) -> &DataType { + T::data_type(self) + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + T::slice(self, offset, length) + } + + fn len(&self) -> usize { + T::len(self) + } + + fn is_empty(&self) -> bool { + T::is_empty(self) + } + + fn offset(&self) -> usize { + T::offset(self) + } + + fn is_null(&self, index: usize) -> bool { + T::is_null(self, index) + } + + fn is_valid(&self, index: usize) -> bool { + T::is_valid(self, index) + } + + fn null_count(&self) -> usize { + T::null_count(self) + } + + fn get_buffer_memory_size(&self) -> usize { + T::get_buffer_memory_size(self) + } + + fn get_array_memory_size(&self) -> usize { + T::get_array_memory_size(self) + } + + fn to_raw( + &self, + ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { + T::to_raw(self) + } +} + +/// A generic trait for accessing the values of an [`Array`] +pub trait 
ArrayAccessor: Array { + type Item: Send + Sync; + + fn value(&self, index: usize) -> Self::Item; + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item; +} + /// Constructs an array using the input `data`. /// Returns a reference-counted `Array` instance. pub fn make_array(data: ArrayData) -> ArrayRef { diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index d9cad1cce66..b01696b0334 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -23,6 +23,7 @@ use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait, }; +use crate::array::array::ArrayAccessor; pub use crate::array::DecimalIter; use crate::buffer::Buffer; use crate::error::{ArrowError, Result}; @@ -245,6 +246,20 @@ impl Array for GenericBinaryArray { } } +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor + for &'a GenericBinaryArray +{ + type Item = &'a [u8]; + + fn value(&self, index: usize) -> Self::Item { + GenericBinaryArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericBinaryArray::value_unchecked(self, index) + } +} + impl From for GenericBinaryArray { fn from(data: ArrayData) -> Self { assert_eq!( diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs index 6e11ff8cb9a..5d1e20705e7 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::array::array::ArrayAccessor; use std::borrow::Borrow; use std::convert::From; use std::iter::{FromIterator, IntoIterator}; @@ -157,6 +158,18 @@ impl Array for BooleanArray { } } +impl<'a> ArrayAccessor for &'a BooleanArray { + type Item = bool; + + fn value(&self, index: usize) -> Self::Item { + BooleanArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + BooleanArray::value_unchecked(self, index) + } +} + impl From> for BooleanArray { fn from(data: Vec) -> Self { let mut mut_buf = MutableBuffer::new_null(data.len()); diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index ac37754e9bf..22aa81ba783 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -24,6 +24,7 @@ use super::{ array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray, }; +use crate::array::array::ArrayAccessor; use crate::{ buffer::MutableBuffer, datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field}, @@ -245,7 +246,7 @@ impl GenericListArray { } } -impl Array for GenericListArray { +impl Array for GenericListArray { fn as_any(&self) -> &dyn Any { self } @@ -259,6 +260,18 @@ impl Array for GenericListArray ArrayAccessor for &'a GenericListArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListArray::value(self, index) + } +} + impl fmt::Debug for GenericListArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" }; @@ -466,6 +479,18 @@ impl Array for FixedSizeListArray { } } +impl ArrayAccessor for FixedSizeListArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: 
usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } +} + impl fmt::Debug for FixedSizeListArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "FixedSizeListArray<{}>\n[\n", self.value_length())?; diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 4ab8d4f46b9..eb731a2b2f1 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -33,6 +33,7 @@ use crate::{ util::trusted_len_unzip, }; +use crate::array::array::ArrayAccessor; use half::f16; /// Array whose elements are of primitive types. @@ -188,6 +189,18 @@ impl Array for PrimitiveArray { } } +impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { + type Item = T::Native; + + fn value(&self, index: usize) -> Self::Item { + PrimitiveArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + PrimitiveArray::value_unchecked(self, index) + } +} + fn as_datetime(v: i64) -> Option { match T::DATA_TYPE { DataType::Date32 => Some(temporal_conversions::date32_to_datetime(v as i32)), diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 85cb346a7e8..df858d858e2 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -23,6 +23,7 @@ use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray, GenericStringIter, OffsetSizeTrait, }; +use crate::array::array::ArrayAccessor; use crate::buffer::Buffer; use crate::util::bit_util; use crate::{buffer::MutableBuffer, datatypes::DataType}; @@ -298,6 +299,20 @@ impl Array for GenericStringArray { } } +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor + for &'a GenericStringArray +{ + type Item = &'a str; + + fn value(&self, index: usize) -> Self::Item { + GenericStringArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericStringArray::value_unchecked(self, index) + } +} + impl 
From for GenericStringArray { fn from(data: ArrayData) -> Self { assert_eq!( diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs index 30d9c8f84ab..e7d14aae81a 100644 --- a/arrow/src/array/equal_json.rs +++ b/arrow/src/array/equal_json.rs @@ -37,6 +37,16 @@ pub trait JsonEqual { } } +impl<'a, T: JsonEqual> JsonEqual for &'a T { + fn equals_json(&self, json: &[&Value]) -> bool { + T::equals_json(self, json) + } + + fn equals_json_values(&self, json: &[Value]) -> bool { + T::equals_json_values(self, json) + } +} + /// Implement array equals for numeric type impl JsonEqual for PrimitiveArray { fn equals_json(&self, json: &[&Value]) -> bool { diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 8e45de28636..a4853d7d73b 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -15,36 +15,37 @@ // specific language governing permissions and limitations // under the License. +use crate::array::array::ArrayAccessor; use crate::array::BasicDecimalArray; -use crate::datatypes::ArrowPrimitiveType; use super::{ - Array, ArrayRef, BooleanArray, Decimal128Array, GenericBinaryArray, GenericListArray, - GenericStringArray, OffsetSizeTrait, PrimitiveArray, + Array, BooleanArray, Decimal128Array, GenericBinaryArray, GenericListArray, + GenericStringArray, PrimitiveArray, }; -/// an iterator that returns Some(T) or None, that can be used on any PrimitiveArray +/// an iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] // Note: This implementation is based on std's [Vec]s' [IntoIter]. 
#[derive(Debug)] -pub struct PrimitiveIter<'a, T: ArrowPrimitiveType> { - array: &'a PrimitiveArray, +pub struct ArrayIter { + array: T, current: usize, current_end: usize, } -impl<'a, T: ArrowPrimitiveType> PrimitiveIter<'a, T> { +impl ArrayIter { /// create a new iterator - pub fn new(array: &'a PrimitiveArray) -> Self { - PrimitiveIter:: { + pub fn new(array: T) -> Self { + let len = array.len(); + ArrayIter { array, current: 0, - current_end: array.len(), + current_end: len, } } } -impl<'a, T: ArrowPrimitiveType> std::iter::Iterator for PrimitiveIter<'a, T> { - type Item = Option; +impl Iterator for ArrayIter { + type Item = Option; #[inline] fn next(&mut self) -> Option { @@ -73,7 +74,7 @@ impl<'a, T: ArrowPrimitiveType> std::iter::Iterator for PrimitiveIter<'a, T> { } } -impl<'a, T: ArrowPrimitiveType> std::iter::DoubleEndedIterator for PrimitiveIter<'a, T> { +impl DoubleEndedIterator for ArrayIter { fn next_back(&mut self) -> Option { if self.current_end == self.current { None @@ -94,304 +95,14 @@ impl<'a, T: ArrowPrimitiveType> std::iter::DoubleEndedIterator for PrimitiveIter } /// all arrays have known size. -impl<'a, T: ArrowPrimitiveType> std::iter::ExactSizeIterator for PrimitiveIter<'a, T> {} - -/// an iterator that returns Some(bool) or None. -// Note: This implementation is based on std's [Vec]s' [IntoIter]. 
-#[derive(Debug)] -pub struct BooleanIter<'a> { - array: &'a BooleanArray, - current: usize, - current_end: usize, -} - -impl<'a> BooleanIter<'a> { - /// create a new iterator - pub fn new(array: &'a BooleanArray) -> Self { - BooleanIter { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> std::iter::Iterator for BooleanIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else if self.array.is_null(self.current) { - self.current += 1; - Some(None) - } else { - let old = self.current; - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(old))) } - } - } +impl ExactSizeIterator for ArrayIter {} - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a> std::iter::DoubleEndedIterator for BooleanIter<'a> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a> std::iter::ExactSizeIterator for BooleanIter<'a> {} - -/// an iterator that returns `Some(&str)` or `None`, for string arrays -#[derive(Debug)] -pub struct GenericStringIter<'a, T> -where - T: OffsetSizeTrait, -{ - array: &'a GenericStringArray, - current: usize, - current_end: usize, -} - -impl<'a, T: OffsetSizeTrait> GenericStringIter<'a, T> { - /// create a new iterator - pub fn new(array: &'a GenericStringArray) -> Self { - GenericStringIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, T: OffsetSizeTrait> std::iter::Iterator for GenericStringIter<'a, T> { - type Item = Option<&'a str>; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, T: OffsetSizeTrait> std::iter::DoubleEndedIterator for GenericStringIter<'a, T> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, T: OffsetSizeTrait> std::iter::ExactSizeIterator for GenericStringIter<'a, T> {} - -/// an iterator that returns `Some(&[u8])` or `None`, for binary arrays -#[derive(Debug)] -pub struct GenericBinaryIter<'a, T> -where - T: OffsetSizeTrait, -{ - array: &'a GenericBinaryArray, - current: usize, - current_end: usize, -} - -impl<'a, T: OffsetSizeTrait> GenericBinaryIter<'a, T> { - /// create a new iterator - pub fn new(array: &'a GenericBinaryArray) -> Self { - GenericBinaryIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, T: OffsetSizeTrait> std::iter::Iterator for GenericBinaryIter<'a, T> { - type Item = Option<&'a [u8]>; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, T: OffsetSizeTrait> std::iter::DoubleEndedIterator for GenericBinaryIter<'a, T> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, T: OffsetSizeTrait> std::iter::ExactSizeIterator for GenericBinaryIter<'a, T> {} - -#[derive(Debug)] -pub struct GenericListArrayIter<'a, S> -where - S: OffsetSizeTrait, -{ - array: &'a GenericListArray, - current: usize, - current_end: usize, -} - -impl<'a, S: OffsetSizeTrait> GenericListArrayIter<'a, S> { - pub fn new(array: &'a GenericListArray) -> Self { - GenericListArrayIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, S: OffsetSizeTrait> std::iter::Iterator for GenericListArrayIter<'a, S> { - type Item = Option; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, S: OffsetSizeTrait> std::iter::DoubleEndedIterator - for GenericListArrayIter<'a, S> -{ - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, S: OffsetSizeTrait> std::iter::ExactSizeIterator - for GenericListArrayIter<'a, S> -{ -} +/// an iterator that returns Some(T) or None, that can be used on any PrimitiveArray +pub type PrimitiveIter<'a, T> = ArrayIter<&'a PrimitiveArray>; +pub type BooleanIter<'a> = ArrayIter<&'a BooleanArray>; +pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray>; +pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; +pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; /// an iterator that returns `Some(i128)` or `None`, that can be used on a /// [`Decimal128Array`] From 5c3081da3e75bfc8ec8765694934042d3b12379c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 22 Jul 2022 13:07:45 -0400 Subject: [PATCH 2/3] Re-export ArrayAccessor --- arrow/src/array/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index d805710ccf5..8acc33c7b87 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -186,6 +186,7 @@ use crate::datatypes::*; // --------------------- Array & ArrayData --------------------- pub use self::array::Array; +pub use self::array::ArrayAccessor; pub use self::array::ArrayRef; pub(crate) use self::data::layout; pub use self::data::ArrayData; From 1d3c9d3b782abe6bef2989c7d3e92586711752f5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 22 Jul 2022 14:19:38 -0400 Subject: [PATCH 3/3] Add docs --- arrow/src/array/array.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 3e1d443b0d0..5c523129631 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -365,8 +365,14 @@ impl<'a, T: Array> Array for &'a T { pub trait ArrayAccessor: Array { type Item: Send + Sync; + /// Returns the element at index `index` + /// # Panics + /// Panics if `index` is outside the bounds of the array fn value(&self, index: usize) -> Self::Item; + /// Returns the 
element at index `index` + /// # Safety + /// Caller is responsible for ensuring that `index` is within the bounds of the array unsafe fn value_unchecked(&self, index: usize) -> Self::Item; }