From b6f08a87e02144277bb0a7aa3708e42f6faf7a26 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 16:04:24 +1300 Subject: [PATCH] Add GenericByteArray (#2946) (#2947) * Add GenericByteArray (#2946) * Lint * Review feedback * Review feedback --- arrow-array/src/array/binary_array.rs | 191 +---------------------- arrow-array/src/array/byte_array.rs | 208 ++++++++++++++++++++++++++ arrow-array/src/array/mod.rs | 3 + arrow-array/src/array/string_array.rs | 178 +--------------------- arrow-array/src/types.rs | 90 ++++++++++- 5 files changed, 309 insertions(+), 361 deletions(-) create mode 100644 arrow-array/src/array/byte_array.rs diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index c8407b252ef..259d949d42a 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -15,118 +15,23 @@ // specific language governing permissions and limitations // under the License. -use crate::iterator::GenericBinaryIter; -use crate::raw_pointer::RawPtrBox; -use crate::{ - empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray, - OffsetSizeTrait, -}; +use crate::types::GenericBinaryType; +use crate::{Array, GenericByteArray, GenericListArray, OffsetSizeTrait}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -use std::any::Any; /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing /// binary data. -pub struct GenericBinaryArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} +pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { - /// Data type of the array. - pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE { - DataType::LargeBinary - } else { - DataType::Binary - }; - /// Get the data type of the array. #[deprecated(note = "please use `Self::DATA_TYPE` instead")] pub const fn get_data_type() -> DataType { Self::DATA_TYPE } - /// Returns the length for value at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns the element at index `i` as bytes slice - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let end = *self.value_offsets().get_unchecked(i + 1); - let start = *self.value_offsets().get_unchecked(i); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (end - start).to_usize().unwrap(), - ) - } - - /// Returns the element at index `i` as bytes slice - /// # Panics - /// Panics if index `i` is out of bounds. - pub fn value(&self, i: usize) -> &[u8] { - assert!( - i < self.data.len(), - "Trying to access an element at index {} from a BinaryArray of length {}", - i, - self.len() - ); - //Soundness: length checked above, offset buffer length is 1 larger than logical array length - let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; - let start = unsafe { self.value_offsets().get_unchecked(i) }; - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - unsafe { - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ) - } - } - /// Creates a [GenericBinaryArray] from a vector of byte slices /// /// See also [`Self::from_iter_values`] @@ -230,85 +135,6 @@ impl GenericBinaryArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } - - /// constructs a new iterator - pub fn iter(&self) -> GenericBinaryIter<'_, OffsetSize> { - GenericBinaryIter::<'_, OffsetSize>::new(self) - } -} - -impl std::fmt::Debug for GenericBinaryArray { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let prefix = OffsetSize::PREFIX; - - write!(f, "{}BinaryArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - std::fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericBinaryArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor - for &'a GenericBinaryArray -{ - type Item = &'a [u8]; - - fn value(&self, index: usize) -> Self::Item { - GenericBinaryArray::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericBinaryArray::value_unchecked(self, index) - } -} - -impl From for GenericBinaryArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &Self::DATA_TYPE, - "[Large]BinaryArray expects Datatype::[Large]Binary" - ); - assert_eq!( - data.buffers().len(), - 2, - "BinaryArray data should contain 2 buffers only (offsets and values)" - ); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - let values = data.buffers()[1].as_ptr(); - Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - -impl From> for ArrayData { - fn from(array: GenericBinaryArray) -> Self { - array.data - } } impl From>> @@ -374,15 +200,6 @@ where } } -impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { - type Item = Option<&'a [u8]>; - type IntoIter = GenericBinaryIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericBinaryIter::<'a, T>::new(self) - } -} - /// An array where each element contains 0 or more bytes. /// The byte length of each element is represented by an i32. /// @@ -836,7 +653,7 @@ mod tests { } #[test] - #[should_panic(expected = "[Large]BinaryArray expects Datatype::[Large]Binary")] + #[should_panic(expected = "LargeBinaryArray expects DataType::LargeBinary")] fn test_binary_array_validation() { let array = BinaryArray::from_iter_values(&[&[1, 2]]); let _ = LargeBinaryArray::from(array.into_data()); diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs new file mode 100644 index 00000000000..8dd206bd263 --- /dev/null +++ b/arrow-array/src/array/byte_array.rs @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{empty_offsets, print_long_array}; +use crate::iterator::ArrayIter; +use crate::raw_pointer::RawPtrBox; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::ByteArrayType; +use crate::{Array, ArrayAccessor, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; + +/// Generic struct for variable-size byte arrays +/// +/// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data +/// +/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes +/// +/// [`StringArray`]: crate::StringArray +/// [`LargeStringArray`]: crate::LargeStringArray +/// [`BinaryArray`]: crate::BinaryArray +/// [`LargeBinaryArray`]: crate::LargeBinaryArray +pub struct GenericByteArray { + data: ArrayData, + value_offsets: RawPtrBox, + value_data: RawPtrBox, +} + +impl GenericByteArray { + /// Data type of the array. + pub const DATA_TYPE: DataType = T::DATA_TYPE; + + /// Returns the length for value at index `i`. + /// # Panics + /// Panics if index `i` is out of bounds. + #[inline] + pub fn value_length(&self, i: usize) -> T::Offset { + let offsets = self.value_offsets(); + offsets[i + 1] - offsets[i] + } + + /// Returns a clone of the value data buffer + pub fn value_data(&self) -> Buffer { + self.data.buffers()[1].clone() + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[T::Offset] { + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { + std::slice::from_raw_parts( + self.value_offsets.as_ptr().add(self.data.offset()), + self.len() + 1, + ) + } + } + + /// Returns the element at index `i` + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array + pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native { + let end = *self.value_offsets().get_unchecked(i + 1); + let start = *self.value_offsets().get_unchecked(i); + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants + + // Safety of `to_isize().unwrap()` + // `start` and `end` are &OffsetSize, which is a generic type that implements the + // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, + // both of which should cleanly cast to isize on an architecture that supports + // 32/64-bit offsets + let b = std::slice::from_raw_parts( + self.value_data.as_ptr().offset(start.to_isize().unwrap()), + (end - start).to_usize().unwrap(), + ); + + // SAFETY: + // ArrayData is valid + T::Native::from_bytes_unchecked(b) + } + + /// Returns the element at index `i` + /// # Panics + /// Panics if index `i` is out of bounds. + pub fn value(&self, i: usize) -> &T::Native { + assert!( + i < self.data.len(), + "Trying to access an element at index {} from a {}{}Array of length {}", + i, + T::Offset::PREFIX, + T::PREFIX, + self.len() + ); + // SAFETY: + // Verified length above + unsafe { self.value_unchecked(i) } + } + + /// constructs a new iterator + pub fn iter(&self) -> ArrayIter<&Self> { + ArrayIter::new(self) + } +} + +impl std::fmt::Debug for GenericByteArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for GenericByteArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray { + type Item = &'a T::Native; + + fn value(&self, index: usize) -> Self::Item { + GenericByteArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericByteArray::value_unchecked(self, index) + } +} + +impl From for GenericByteArray { + fn from(data: ArrayData) -> Self { + assert_eq!( + data.data_type(), + &Self::DATA_TYPE, + "{}{}Array expects DataType::{}", + T::Offset::PREFIX, + T::PREFIX, + Self::DATA_TYPE + ); + assert_eq!( + data.buffers().len(), + 2, + "{}{}Array data should contain 2 buffers only (offsets and values)", + T::Offset::PREFIX, + T::PREFIX, + ); + // Handle case of empty offsets + let offsets = match data.is_empty() && data.buffers()[0].is_empty() { + true => empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; + let values = data.buffers()[1].as_ptr(); + Self { + data, + // SAFETY: + // ArrayData must be valid, and validated data type above + value_offsets: unsafe { RawPtrBox::new(offsets) }, + value_data: unsafe { RawPtrBox::new(values) }, + } + } +} + +impl From> for ArrayData { + fn from(array: GenericByteArray) -> Self { + array.data + } +} + +impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray { + type Item = Option<&'a T::Native>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1613e4a69b8..41aa438c9fb 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -31,6 +31,9 @@ pub use binary_array::*; mod boolean_array; pub use boolean_array::*; +mod byte_array; +pub use byte_array::*; + mod dictionary_array; pub use dictionary_array::*; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 0cf45a44859..94fcbae02e5 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -15,67 +15,27 @@ // specific language governing permissions and limitations // under the License. -use crate::iterator::GenericStringIter; -use crate::raw_pointer::RawPtrBox; +use crate::types::GenericStringType; use crate::{ - empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, - GenericListArray, OffsetSizeTrait, + Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, }; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -use std::any::Any; /// Generic struct for \[Large\]StringArray /// /// See [`StringArray`] and [`LargeStringArray`] for storing /// specific string data. -pub struct GenericStringArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} +pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { - /// Data type of the array. - pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }; - /// Get the data type of the array. #[deprecated(note = "please use `Self::DATA_TYPE` instead")] pub const fn get_data_type() -> DataType { Self::DATA_TYPE } - /// Returns the length for the element at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - /// Returns the number of `Unicode Scalar Value` in the string at index `i`. /// # Performance /// This function has `O(n)` time complexity where `n` is the string length. @@ -85,45 +45,6 @@ impl GenericStringArray { self.value(i).chars().count() } - /// Returns the element at index - /// # Safety - /// caller is responsible for ensuring that index is within the array bounds - #[inline] - pub unsafe fn value_unchecked(&self, i: usize) -> &str { - let end = self.value_offsets().get_unchecked(i + 1).as_usize(); - let start = self.value_offsets().get_unchecked(i).as_usize(); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - // ISSUE: utf-8 well formedness is not checked - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - let slice = - std::slice::from_raw_parts(self.value_data.as_ptr().add(start), end - start); - std::str::from_utf8_unchecked(slice) - } - - /// Returns the element at index `i` as &str - /// # Panics - /// Panics if index `i` is out of bounds. - #[inline] - pub fn value(&self, i: usize) -> &str { - assert!( - i < self.data.len(), - "Trying to access an element at index {} from a StringArray of length {}", - i, - self.len() - ); - // Safety: - // `i < self.data.len() - unsafe { self.value_unchecked(i) } - } - /// Convert a list array to a string array. /// /// Note: this performs potentially expensive UTF-8 validation, consider using @@ -283,62 +204,6 @@ where } } -impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericStringArray { - type Item = Option<&'a str>; - type IntoIter = GenericStringIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericStringIter::<'a, T>::new(self) - } -} - -impl<'a, T: OffsetSizeTrait> GenericStringArray { - /// constructs a new iterator - pub fn iter(&'a self) -> GenericStringIter<'a, T> { - GenericStringIter::<'a, T>::new(self) - } -} - -impl std::fmt::Debug for GenericStringArray { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let prefix = OffsetSize::PREFIX; - - write!(f, "{}StringArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - std::fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericStringArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor - for &'a GenericStringArray -{ - type Item = &'a str; - - fn value(&self, index: usize) -> Self::Item { - GenericStringArray::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericStringArray::value_unchecked(self, index) - } -} - impl From> for GenericStringArray { @@ -356,32 +221,6 @@ impl From> } } -impl From for GenericStringArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &Self::DATA_TYPE, - "[Large]StringArray expects Datatype::[Large]Utf8" - ); - assert_eq!( - data.buffers().len(), - 2, - "StringArray data should contain 2 buffers only (offsets and values)" - ); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - let values = data.buffers()[1].as_ptr(); - Self { - data, - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - impl From>> for GenericStringArray { @@ -402,12 +241,6 @@ impl From> for GenericStringArray From> for ArrayData { - fn from(array: GenericStringArray) -> Self { - array.data - } -} - /// An array where each element is a variable-sized sequence of bytes representing a string /// whose maximum length (in bytes) is represented by a i32. /// @@ -436,6 +269,7 @@ pub type LargeStringArray = GenericStringArray; mod tests { use super::*; use crate::builder::{ListBuilder, StringBuilder}; + use arrow_buffer::Buffer; use arrow_schema::Field; #[test] @@ -464,7 +298,7 @@ mod tests { } #[test] - #[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")] + #[should_panic(expected = "StringArray expects DataType::Utf8")] fn test_string_array_from_int() { let array = LargeStringArray::from(vec!["a", "b"]); drop(StringArray::from(array.into_data())); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index edf6d40f3ae..e6197eed19c 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -19,6 +19,7 @@ use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; +use crate::OffsetSizeTrait; use arrow_buffer::i256; use arrow_data::decimal::{ validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, @@ -28,6 +29,7 @@ use arrow_data::decimal::{ use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; use half::f16; +use std::marker::PhantomData; use std::ops::{Add, Sub}; // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` @@ -464,7 +466,10 @@ impl Date64Type { } } -mod private { +/// Crate private types for Decimal Arrays +/// +/// Not intended to be used outside this crate +mod decimal { use super::*; pub trait DecimalTypeSealed {} @@ -482,7 +487,7 @@ mod private { /// [`Decimal128Array`]: [crate::array::Decimal128Array] /// [`Decimal256Array`]: [crate::array::Decimal256Array] pub trait DecimalType: - 'static + Send + Sync + ArrowPrimitiveType + private::DecimalTypeSealed + 'static + Send + Sync + ArrowPrimitiveType + decimal::DecimalTypeSealed { const BYTE_LENGTH: usize; const MAX_PRECISION: u8; @@ -574,6 +579,87 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String } } +/// Crate private types for Byte Arrays +/// +/// Not intended to be used outside this crate +pub(crate) mod bytes { + use super::*; + + pub trait ByteArrayTypeSealed {} + impl ByteArrayTypeSealed for GenericStringType {} + impl ByteArrayTypeSealed for GenericBinaryType {} + + pub trait ByteArrayNativeType: std::fmt::Debug + Send + Sync { + /// # Safety + /// + /// `b` must be a valid byte sequence for `Self` + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; + } + + impl ByteArrayNativeType for [u8] { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + b + } + } + + impl ByteArrayNativeType for str { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + std::str::from_utf8_unchecked(b) + } + } +} + +/// A trait over the variable-size byte array types +/// +/// See [Variable Size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) +pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { + type Offset: OffsetSizeTrait; + type Native: bytes::ByteArrayNativeType + AsRef<[u8]> + ?Sized; + /// "Binary" or "String", for use in error messages + const PREFIX: &'static str; + const DATA_TYPE: DataType; +} + +/// [`ByteArrayType`] for string arrays +pub struct GenericStringType { + phantom: PhantomData, +} + +impl ByteArrayType for GenericStringType { + type Offset = O; + type Native = str; + const PREFIX: &'static str = "String"; + + const DATA_TYPE: DataType = if O::IS_LARGE { + DataType::LargeUtf8 + } else { + DataType::Utf8 + }; +} + +pub type Utf8Type = GenericStringType; +pub type LargeUtf8Type = GenericStringType; + +/// [`ByteArrayType`] for binary arrays +pub struct GenericBinaryType { + phantom: PhantomData, +} + +impl ByteArrayType for GenericBinaryType { + type Offset = O; + type Native = [u8]; + const PREFIX: &'static str = "Binary"; + + const DATA_TYPE: DataType = if O::IS_LARGE { + DataType::LargeBinary + } else { + DataType::Binary + }; +} + +pub type BinaryType = GenericBinaryType; +pub type LargeBinaryType = GenericBinaryType; + #[cfg(test)] mod tests { use super::*;