diff --git a/Cargo.toml b/Cargo.toml index 355c65a8b80..270d23f26c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "arrow", + "arrow-data", "arrow-schema", "arrow-buffer", "arrow-flight", diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml new file mode 100644 index 00000000000..289b1bbd0eb --- /dev/null +++ b/arrow-data/Cargo.toml @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-data" +version = "23.0.0" +description = "Array data abstractions for Apache Arrow" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_data" +path = "src/lib.rs" +bench = false + +[features] +# force_validate runs full data validation for all arrays that are created +# this is not enabled by default as it is too computationally expensive +# but is run as part of our CI checks +force_validate = [] + +[dependencies] + +arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "23.0.0", path = "../arrow-schema" } + +num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "2.0", default-features = false } + +[dev-dependencies] + +[build-dependencies] diff --git a/arrow/src/util/bit_iterator.rs b/arrow-data/src/bit_iterator.rs similarity index 98% rename from arrow/src/util/bit_iterator.rs rename to arrow-data/src/bit_iterator.rs index ceefaa860cb..45a42c3910f 100644 --- a/arrow/src/util/bit_iterator.rs +++ b/arrow-data/src/bit_iterator.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use arrow_buffer::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; use std::result::Result; /// Iterator of contiguous ranges of set bits within a provided packed bitmask diff --git a/arrow/src/util/bit_mask.rs b/arrow-data/src/bit_mask.rs similarity index 98% rename from arrow/src/util/bit_mask.rs rename to arrow-data/src/bit_mask.rs index da542a2bb1f..6a0a4603899 100644 --- a/arrow/src/util/bit_mask.rs +++ b/arrow-data/src/bit_mask.rs @@ -17,8 +17,8 @@ //! 
Utils for working with packed bit masks -use crate::util::bit_chunk_iterator::BitChunks; -use crate::util::bit_util::{ceil, get_bit, set_bit}; +use arrow_buffer::bit_chunk_iterator::BitChunks; +use arrow_buffer::bit_util::{ceil, get_bit, set_bit}; /// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the /// bits in `data` in the range `[offset_read..offset_read+len]` diff --git a/arrow/src/bitmap.rs b/arrow-data/src/bitmap.rs similarity index 93% rename from arrow/src/bitmap.rs rename to arrow-data/src/bitmap.rs index dbf9706677a..0002ef02212 100644 --- a/arrow/src/bitmap.rs +++ b/arrow-data/src/bitmap.rs @@ -17,8 +17,8 @@ //! Defines [Bitmap] for tracking validity bitmaps -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; +use arrow_buffer::bit_util; +use arrow_schema::ArrowError; use std::mem; use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; @@ -56,6 +56,10 @@ impl Bitmap { unsafe { bit_util::get_bit_raw(self.bits.as_ptr(), i) } } + pub fn buffer(&self) -> &Buffer { + &self.bits + } + pub fn buffer_ref(&self) -> &Buffer { &self.bits } @@ -76,9 +80,9 @@ impl Bitmap { } impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { - type Output = Result; + type Output = Result; - fn bitand(self, rhs: &'b Bitmap) -> Result { + fn bitand(self, rhs: &'b Bitmap) -> Result { if self.bits.len() != rhs.bits.len() { return Err(ArrowError::ComputeError( "Buffers must be the same size to apply Bitwise AND.".to_string(), @@ -95,9 +99,9 @@ impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { } impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { - type Output = Result; + type Output = Result; - fn bitor(self, rhs: &'b Bitmap) -> Result { + fn bitor(self, rhs: &'b Bitmap) -> Result { if self.bits.len() != rhs.bits.len() { return Err(ArrowError::ComputeError( "Buffers must be the same size to apply Bitwise OR.".to_string(), diff --git a/arrow/src/array/data.rs b/arrow-data/src/data.rs similarity index 62% rename from 
arrow/src/array/data.rs rename to arrow-data/src/data.rs index 7571ba210d7..37c059748fe 100644 --- a/arrow/src/array/data.rs +++ b/arrow-data/src/data.rs @@ -18,24 +18,19 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. -use crate::datatypes::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DataType, - IntervalUnit, UnionMode, -}; -use crate::error::{ArrowError, Result}; -use crate::util::bit_iterator::BitSliceIterator; -use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::bit_util, +use crate::decimal::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, }; +use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; use std::convert::TryInto; use std::mem; use std::ops::Range; use std::sync::Arc; -use super::equal::equal; +use crate::equal; #[inline] pub(crate) fn contains_nulls( @@ -346,7 +341,7 @@ impl ArrayData { offset: usize, buffers: Vec, child_data: Vec, - ) -> Result { + ) -> Result { // we must check the length of `null_bit_buffer` first // because we use this buffer to calculate `null_count` // in `Self::new_unchecked`. @@ -390,33 +385,6 @@ impl ArrayData { &self.data_type } - /// Updates the [DataType] of this ArrayData/ - /// - /// panic's if the new DataType is not compatible with the - /// existing type. 
- /// - /// Note: currently only changing a [DataType::Decimal128]s or - /// [DataType::Decimal256]s precision and scale are supported - #[inline] - pub(crate) fn with_data_type(mut self, new_data_type: DataType) -> Self { - if matches!(self.data_type, DataType::Decimal128(_, _)) { - assert!( - matches!(new_data_type, DataType::Decimal128(_, _)), - "only 128-bit DecimalType is supported for new datatype" - ); - } else if matches!(self.data_type, DataType::Decimal256(_, _)) { - assert!( - matches!(new_data_type, DataType::Decimal256(_, _)), - "only 256-bit DecimalType is supported for new datatype" - ); - } else { - panic!("only DecimalType is supported.") - } - - self.data_type = new_data_type; - self - } - /// Returns a slice of buffers for this array data pub fn buffers(&self) -> &[Buffer] { &self.buffers[..] @@ -562,7 +530,7 @@ impl ArrayData { /// * the buffer is not byte-aligned with type T, or /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) #[inline] - pub(crate) fn buffer(&self, buffer: usize) -> &[T] { + pub fn buffer(&self, buffer: usize) -> &[T] { let values = unsafe { self.buffers[buffer].as_slice().align_to::() }; if !values.0.is_empty() || !values.2.is_empty() { panic!("The buffer is not byte-aligned with its interpretation") @@ -654,7 +622,7 @@ impl ArrayData { /// /// See [ArrayData::validate_full] to validate fully the offset content /// and the validitiy of utf8 data - pub fn validate(&self) -> Result<()> { + pub fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; @@ -769,7 +737,7 @@ impl ArrayData { /// entries. /// /// For an empty array, the `buffer` can also be empty. 
- fn typed_offsets(&self) -> Result<&[T]> { + fn typed_offsets(&self) -> Result<&[T], ArrowError> { // An empty list-like array can have 0 offsets if self.len == 0 && self.buffers[0].is_empty() { return Ok(&[]); @@ -783,7 +751,7 @@ impl ArrayData { &self, idx: usize, len: usize, - ) -> Result<&[T]> { + ) -> Result<&[T], ArrowError> { let buffer = &self.buffers[idx]; let required_len = (len + self.offset) * std::mem::size_of::(); @@ -806,7 +774,7 @@ impl ArrayData { fn validate_offsets( &self, values_length: usize, - ) -> Result<()> { + ) -> Result<(), ArrowError> { // Justification: buffer size was validated above let offsets = self.typed_offsets::()?; if offsets.is_empty() { @@ -852,7 +820,7 @@ impl ArrayData { } /// Validates the layout of `child_data` ArrayData structures - fn validate_child_data(&self) -> Result<()> { + fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::List(field) | DataType::Map(field, _) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -943,13 +911,13 @@ impl ArrayData { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result<&ArrayData> { + ) -> Result<&ArrayData, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } /// Returns `Err` if self.child_data does not have exactly `expected_len` elements - fn validate_num_child_data(&self, expected_len: usize) -> Result<()> { + fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { if self.child_data().len() != expected_len { Err(ArrowError::InvalidArgumentError(format!( "Value data for {} should contain {} child data array(s), had {}", @@ -968,7 +936,7 @@ impl ArrayData { &self, i: usize, expected_type: &DataType, - ) -> Result<&ArrayData> { + ) -> Result<&ArrayData, ArrowError> { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -999,7 +967,7 @@ impl ArrayData { /// Does not (yet) check /// 1. 
Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) /// Note calls `validate()` internally - pub fn validate_full(&self) -> Result<()> { + pub fn validate_full(&self) -> Result<(), ArrowError> { // Check all buffer sizes prior to looking at them more deeply in this function self.validate()?; @@ -1034,7 +1002,7 @@ impl ArrayData { Ok(()) } - pub fn validate_values(&self) -> Result<()> { + pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Decimal128(p, _) => { let values_buffer: &[i128] = self.typed_buffer(0, self.len)?; @@ -1106,10 +1074,14 @@ impl ArrayData { /// /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` - fn validate_each_offset(&self, offset_limit: usize, validate: V) -> Result<()> + fn validate_each_offset( + &self, + offset_limit: usize, + validate: V, + ) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, - V: Fn(usize, Range) -> Result<()>, + V: Fn(usize, Range) -> Result<(), ArrowError>, { self.typed_offsets::()? 
.iter() @@ -1147,7 +1119,7 @@ impl ArrayData { } }) .skip(1) // the first element is meaningless - .try_for_each(|res: Result<(usize, Range)>| { + .try_for_each(|res: Result<(usize, Range), ArrowError>| { let (item_index, range) = res?; validate(item_index-1, range) }) @@ -1155,7 +1127,7 @@ impl ArrayData { /// Ensures that all strings formed by the offsets in `buffers[0]` /// into `buffers[1]` are valid utf8 sequences - fn validate_utf8(&self) -> Result<()> + fn validate_utf8(&self) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1195,7 +1167,7 @@ impl ArrayData { /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are /// between `0` and `offset_limit` - fn validate_offsets_full(&self, offset_limit: usize) -> Result<()> + fn validate_offsets_full(&self, offset_limit: usize) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1208,7 +1180,7 @@ impl ArrayData { /// Validates that each value in self.buffers (typed as T) /// is within the range [0, max_value], inclusive - fn check_bounds(&self, max_value: i64) -> Result<()> + fn check_bounds(&self, max_value: i64) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1288,7 +1260,7 @@ impl ArrayData { /// Return the expected [`DataTypeLayout`] Arrays of this data /// type are expected to have -pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { +pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) use std::mem::size_of; @@ -1381,9 +1353,9 @@ pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { } /// Layout specification for a data type -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Eq)] // Note: Follows structure from C++: 
https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91 -pub(crate) struct DataTypeLayout { +pub struct DataTypeLayout { /// A vector of buffer layout specifications, one for each expected buffer pub buffers: Vec, @@ -1429,8 +1401,8 @@ impl DataTypeLayout { } /// Layout specification for a single data type buffer -#[derive(Debug, PartialEq)] -pub(crate) enum BufferSpec { +#[derive(Debug, PartialEq, Eq)] +pub enum BufferSpec { /// each element has a fixed width FixedWidth { byte_width: usize }, /// Variable width, such as string data for utf8 data @@ -1449,7 +1421,7 @@ pub(crate) enum BufferSpec { impl PartialEq for ArrayData { fn eq(&self, other: &Self) -> bool { - equal(self, other) + equal::equal(self, other) } } @@ -1545,7 +1517,7 @@ impl ArrayDataBuilder { } /// Creates an array data, validating all inputs - pub fn build(self) -> Result { + pub fn build(self) -> Result { ArrayData::try_new( self.data_type, self.len, @@ -1576,16 +1548,19 @@ impl From for ArrayDataBuilder { #[cfg(test)] mod tests { use super::*; - use std::ptr::NonNull; + use arrow_schema::Field; + + // See arrow/tests/array_data_validation.rs for test of array validation + + /// returns a buffer initialized with some constant value for tests + fn make_i32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42i32; n]) + } - use crate::array::{ - make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, - Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, - UInt8Builder, - }; - use crate::buffer::Buffer; - use crate::datatypes::Field; - use crate::util::bit_util; + /// returns a buffer initialized with some constant value for tests + fn make_f32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42f32; n]) + } #[test] fn test_builder() { @@ -1771,1124 +1746,30 @@ mod tests { } #[test] - #[should_panic( - expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" - )] - fn test_buffer_too_small() { 
- let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8) - ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap(); - } - - #[test] - #[should_panic( - expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" - )] - fn test_buffer_too_small_offset() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - // should fail -- size is ok, but also has offset - ArrayData::try_new(DataType::Int64, 1, None, 1, vec![buffer], vec![]).unwrap(); - } - - #[test] - #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")] - fn test_bad_number_of_buffers() { - let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]); - let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]) - .unwrap(); - } - - #[test] - #[should_panic(expected = "integer overflow computing min buffer size")] - fn test_fixed_width_overflow() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) - .unwrap(); - } - - #[test] - #[should_panic(expected = "null_bit_buffer size too small. 
got 1 needed 2")] - fn test_bitmap_too_small() { - let buffer = make_i32_buffer(9); - let null_bit_buffer = Buffer::from(vec![0b11111111]); - - ArrayData::try_new( - DataType::Int32, - 9, - Some(null_bit_buffer), - 0, - vec![buffer], - vec![], - ) - .unwrap(); - } - - // Test creating a dictionary with a non integer type - #[test] - #[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] - fn test_non_int_dictionary() { - let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - let data_type = - DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); - let child_data = ArrayData::try_new( - DataType::Int32, - 1, - None, - 0, - vec![i32_buffer.clone()], - vec![], - ) - .unwrap(); - ArrayData::try_new( - data_type, - 1, - None, - 0, - vec![i32_buffer.clone(), i32_buffer], - vec![child_data], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")] - fn test_mismatched_dictionary_types() { - // test w/ dictionary created with a child array data that has type different than declared - let string_array: StringArray = - vec![Some("foo"), Some("bar")].into_iter().collect(); - let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]); - // Dict says LargeUtf8 but array is Utf8 - let data_type = DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::LargeUtf8), - ); - let child_data = string_array.into_data(); - ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data]) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_empty_offsets_buffer() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from(&[]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_single_zero_offset() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); - ArrayData::try_new( - 
DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] - fn test_empty_utf8_array_with_invalid_offset() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[1i32]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_non_zero_offset() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 3, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" - )] - fn test_empty_large_utf8_array_with_wrong_type_offsets() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); - ArrayData::try_new( - DataType::LargeUtf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of Utf8 isn't large enough. Expected 12 bytes got 8" - )] - fn test_validate_offsets_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. 
Expected 24 bytes got 16" - )] - fn test_validate_offsets_i64() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); - ArrayData::try_new( - DataType::LargeUtf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] - fn test_validate_offsets_negative_first_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] - fn test_validate_offsets_negative_last_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] - fn test_validate_offsets_range_too_small() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // start offset is larger than end - let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")] - fn test_validate_offsets_range_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - 
) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] - fn test_validate_offsets_first_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_validate_offsets_first_too_large_skipped() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer, but offset starts at 1 so it is skipped - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); - let data = ArrayData::try_new( - DataType::Utf8, - 2, - None, - 1, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - let array: StringArray = data.into(); - let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect(); - assert_eq!(array, expected); - } + fn test_contains_nulls() { + let buffer: Buffer = + MutableBuffer::from_iter([false, false, false, true, true, false]).into(); - #[test] - #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] - fn test_validate_offsets_last_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); + assert!(contains_nulls(Some(&buffer), 0, 6)); + assert!(contains_nulls(Some(&buffer), 0, 3)); + assert!(!contains_nulls(Some(&buffer), 3, 2)); + assert!(!contains_nulls(Some(&buffer), 0, 0)); } #[test] - #[should_panic( - expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList" - )] - fn test_validate_fixed_size_list() { - // child has 4 
elements, - let child_array = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - // but claim we have 3 elements for a fixed size of 2 - // 10 is off the end of the buffer - let field = Field::new("field", DataType::Int32, true); - ArrayData::try_new( - DataType::FixedSizeList(Box::new(field), 2), - 3, - None, - 0, - vec![], - vec![child_array.into_data()], - ) - .unwrap(); - } + fn test_into_buffers() { + let data_types = vec![ + DataType::Union(vec![], vec![], UnionMode::Dense), + DataType::Union(vec![], vec![], UnionMode::Sparse), + ]; - #[test] - #[should_panic(expected = "Child type mismatch for Struct")] - fn test_validate_struct_child_type() { - let field1 = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - // validate the the type of struct fields matches child fields - ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), - 3, - None, - 0, - vec![], - vec![field1.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" - )] - fn test_validate_struct_child_length() { - // field length only has 4 items, but array claims to have 6 - let field1 = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), - 6, - None, - 0, - vec![], - vec![field1.into_data()], - ) - .unwrap(); - } - - /// Test that the array of type `data_type` that has invalid utf8 data errors - fn check_utf8_validation(data_type: DataType) { - // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself - let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); - let offsets: Vec = [0, 2, 3] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 2, - None, - 0, - 
vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] - fn test_validate_utf8_content() { - check_utf8_validation::(DataType::Utf8); - } - - #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] - fn test_validate_large_utf8_content() { - check_utf8_validation::(DataType::LargeUtf8); - } - - /// Tests that offsets are at valid codepoint boundaries - fn check_utf8_char_boundary(data_type: DataType) { - let data_buffer = Buffer::from("🙀".as_bytes()); - let offsets: Vec = [0, 1, data_buffer.len()] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] - fn test_validate_utf8_char_boundary() { - check_utf8_char_boundary::(DataType::Utf8); - } - - #[test] - #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] - fn test_validate_large_utf8_char_boundary() { - check_utf8_char_boundary::(DataType::LargeUtf8); - } - - /// Test that the array of type `data_type` that has invalid indexes (out of bounds) - fn check_index_out_of_bounds_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); - // First two offsets are fine, then 5 is out of bounds - let offsets: Vec = [0, 1, 2, 5, 2] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 4, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_utf8_out_of_bounds() { - 
check_index_out_of_bounds_validation::(DataType::Utf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_large_utf8_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::LargeUtf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_binary_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::Binary); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_large_binary_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::LargeBinary); - } - - // validate that indexes don't go bacwards check indexes that go backwards - fn check_index_backwards_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); - // First three offsets are fine, then 1 goes backwards - let offsets: Vec = [0, 1, 2, 2, 1] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 4, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_utf8_index_backwards() { - check_index_backwards_validation::(DataType::Utf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_large_utf8_index_backwards() { - check_index_backwards_validation::(DataType::LargeUtf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_binary_index_backwards() { - check_index_backwards_validation::(DataType::Binary); - } - - #[test] - #[should_panic( - 
expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_large_binary_index_backwards() { - check_index_backwards_validation::(DataType::LargeBinary); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])" - )] - fn test_validate_dictionary_index_too_large() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // 3 is not a valid index into the values (only 0 and 1) - let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]" - )] - fn test_validate_dictionary_index_negative() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all! 
- let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - fn test_validate_dictionary_index_negative_but_not_referenced() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all, but the array is length 1 - // so the -1 should not be looked at - let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - // Expect this not to panic - ArrayData::try_new( - data_type, - 1, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)" - )] - fn test_validate_dictionary_index_giant_negative() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all! 
- let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - /// Test that the list of type `data_type` generates correct offset out of bounds errors - fn check_list_offsets(data_type: DataType) { - let values: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); - - // 5 is an invalid offset into a list of only three values - let offsets: Vec = [0, 2, 5, 4] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); - - ArrayData::try_new( - data_type, - 3, - None, - 0, - vec![offsets_buffer], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" - )] - fn test_validate_list_offsets() { - let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::List(Box::new(field_type))); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" - )] - fn test_validate_large_list_offsets() { - let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::LargeList(Box::new(field_type))); - } - - /// Test that the list of type `data_type` generates correct errors for negative offsets - #[test] - #[should_panic( - expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" - )] - fn test_validate_list_negative_offsets() { - let values: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); - let field_type = Field::new("f", values.data_type().clone(), true); - let data_type = DataType::List(Box::new(field_type)); - - // -1 is an invalid offset any 
way you look at it - let offsets: Vec = vec![0, 2, -1, 4]; - let offsets_buffer = Buffer::from_slice_ref(&offsets); - - ArrayData::try_new( - data_type, - 3, - None, - 0, - vec![offsets_buffer], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])" - )] - /// test that children are validated recursively (aka bugs in child data of struct also are flagged) - fn test_validate_recursive() { - // Form invalid dictionary array - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - // -1 is not a valid index - let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); - - let dict_data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - // purposely create an invalid child data - let dict_data = unsafe { - ArrayData::new_unchecked( - dict_data_type, - 2, - None, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - }; - - // Now, try and create a struct with this invalid child data (and expect an error) - let data_type = - DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); - - ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap(); - } - - /// returns a buffer initialized with some constant value for tests - fn make_i32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42i32; n]) - } - - /// returns a buffer initialized with some constant value for tests - fn make_f32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42f32; n]) - } - - #[test] - #[should_panic(expected = "Expected Int64 but child data had Int32")] - fn test_validate_union_different_types() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1), Some(2)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - 
DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), // data is int32 - ], - vec![0, 1], - UnionMode::Sparse, - ), - 2, - None, - 0, - vec![type_ids], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - // sparse with wrong sized children - #[test] - #[should_panic( - expected = "Sparse union child array #1 has length smaller than expected for union array (1 < 2)" - )] - fn test_validate_union_sparse_different_child_len() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - // field 2 only has 1 item but array should have 2 - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Sparse, - ), - 2, - None, - 0, - vec![type_ids], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Expected 2 buffers in array of type Union")] - fn test_validate_union_dense_without_offsets() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - 2, - None, - 0, - vec![type_ids], // need offsets buffer here too - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Need at least 8 bytes in buffers[1] in array of type Union" - )] - fn test_validate_union_dense_with_bad_len() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = 
Buffer::from_slice_ref(&[0i8, 1i8]); - let offsets = Buffer::from_slice_ref(&[0i32]); // should have 2 offsets, but only have 1 - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - 2, - None, - 0, - vec![type_ids, offsets], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - fn test_try_new_sliced_struct() { - let mut builder = StructBuilder::new( - vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Boolean, true), - ], - vec![ - Box::new(Int32Builder::with_capacity(5)), - Box::new(BooleanBuilder::with_capacity(5)), - ], - ); - - // struct[0] = { a: 10, b: true } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(10)); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(true)); - builder.append(true); - - // struct[1] = null - builder - .field_builder::(0) - .unwrap() - .append_option(None); - builder - .field_builder::(1) - .unwrap() - .append_option(None); - builder.append(false); - - // struct[2] = { a: null, b: false } - builder - .field_builder::(0) - .unwrap() - .append_option(None); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(false)); - builder.append(true); - - // struct[3] = { a: 21, b: null } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(21)); - builder - .field_builder::(1) - .unwrap() - .append_option(None); - builder.append(true); - - // struct[4] = { a: 18, b: false } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(18)); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(false)); - builder.append(true); - - let struct_array = builder.finish(); - let struct_array_slice = struct_array.slice(1, 3); - let struct_array_data = struct_array_slice.data(); - - let cloned_data = ArrayData::try_new( - struct_array_slice.data_type().clone(), - struct_array_slice.len(), - 
struct_array_data.null_buffer().cloned(), - struct_array_slice.offset(), - struct_array_data.buffers().to_vec(), - struct_array_data.child_data().to_vec(), - ) - .unwrap(); - let cloned = crate::array::make_array(cloned_data); - - assert_eq!(&struct_array_slice, &cloned); - } - - #[test] - fn test_into_buffers() { - let data_types = vec![ - DataType::Union(vec![], vec![], UnionMode::Dense), - DataType::Union(vec![], vec![], UnionMode::Sparse), - ]; - - for data_type in data_types { - let buffers = new_buffers(&data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(&data_type, buffer1, buffer2); + for data_type in data_types { + let buffers = new_buffers(&data_type, 0); + let [buffer1, buffer2] = buffers; + let buffers = into_buffers(&data_type, buffer1, buffer2); let layout = layout(&data_type); assert_eq!(buffers.len(), layout.buffers.len()); } } - - #[test] - fn test_string_data_from_foreign() { - let mut strings = "foobarfoobar".to_owned(); - let mut offsets = vec![0_i32, 0, 3, 6, 12]; - let mut bitmap = vec![0b1110_u8]; - - let strings_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(strings.as_mut_ptr()), - strings.len(), - Arc::new(strings), - ) - }; - let offsets_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(offsets.as_mut_ptr() as *mut u8), - offsets.len() * std::mem::size_of::(), - Arc::new(offsets), - ) - }; - let null_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(bitmap.as_mut_ptr()), - bitmap.len(), - Arc::new(bitmap), - ) - }; - - let data = ArrayData::try_new( - DataType::Utf8, - 4, - Some(null_buffer), - 0, - vec![offsets_buffer, strings_buffer], - vec![], - ) - .unwrap(); - - let array = make_array(data); - let array = array.as_any().downcast_ref::().unwrap(); - - let expected = - StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); - - assert_eq!(array, &expected); - } - - #[test] - #[cfg(not(feature = 
"force_validate"))] - fn test_decimal_full_validation() { - let values_builder = UInt8Builder::with_capacity(10); - let byte_width = 16; - let mut fixed_size_builder = - FixedSizeListBuilder::new(values_builder, byte_width); - let value_as_bytes = 123456_i128.to_le_bytes(); - fixed_size_builder - .values() - .append_slice(value_as_bytes.as_slice()); - fixed_size_builder.append(true); - let fixed_size_array = fixed_size_builder.finish(); - - // Build ArrayData for Decimal - let builder = ArrayData::builder(DataType::Decimal128(5, 3)) - .len(fixed_size_array.len()) - .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); - let array_data = unsafe { builder.build_unchecked() }; - let validation_result = array_data.validate_full(); - let error = validation_result.unwrap_err(); - assert_eq!( - "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", - error.to_string() - ); - } - - #[test] - fn test_decimal_validation() { - let mut builder = Decimal128Builder::with_capacity(4, 10, 4); - builder.append_value(10000).unwrap(); - builder.append_value(20000).unwrap(); - let array = builder.finish(); - - array.data().validate_full().unwrap(); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_sliced_array_child() { - let values = Int32Array::from_iter_values([1, 2, 3]); - let values_sliced = values.slice(1, 2); - let offsets = Buffer::from_iter([1_i32, 3_i32]); - - let list_field = Field::new("element", DataType::Int32, false); - let data_type = DataType::List(Box::new(list_field)); - - let data = unsafe { - ArrayData::new_unchecked( - data_type, - 1, - None, - None, - 0, - vec![offsets], - vec![values_sliced.into_data()], - ) - }; - - let err = data.validate_values().unwrap_err(); - assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2"); - } - - #[test] - fn test_contains_nulls() { - let buffer: Buffer = - 
MutableBuffer::from_iter([false, false, false, true, true, false]).into(); - - assert!(contains_nulls(Some(&buffer), 0, 6)); - assert!(contains_nulls(Some(&buffer), 0, 3)); - assert!(!contains_nulls(Some(&buffer), 3, 2)); - assert!(!contains_nulls(Some(&buffer), 0, 0)); - } } diff --git a/arrow/src/datatypes/decimal.rs b/arrow-data/src/decimal.rs similarity index 95% rename from arrow/src/datatypes/decimal.rs rename to arrow-data/src/decimal.rs index ffdb04e0d77..592a461ad5c 100644 --- a/arrow/src/datatypes/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{ArrowError, Result}; -use crate::util::decimal::singed_cmp_le_bytes; +use arrow_schema::ArrowError; use num::BigInt; use std::cmp::Ordering; @@ -745,7 +744,7 @@ pub const DECIMAL_DEFAULT_SCALE: u8 = 10; /// Validates that the specified `i128` value can be properly /// interpreted as a Decimal number with precision `precision` #[inline] -pub(crate) fn validate_decimal_precision(value: i128, precision: u8) -> Result<()> { +pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL128_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal128 is {}, but got {}", @@ -774,10 +773,10 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: u8) -> Result<( /// Validates that the specified `byte_array` of little-endian format /// value can be properly interpreted as a Decimal256 number with precision `precision` #[inline] -pub(crate) fn validate_decimal256_precision_with_lt_bytes( +pub fn validate_decimal256_precision_with_lt_bytes( lt_value: &[u8], precision: u8, -) -> Result<()> { +) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal256 is {}, but got {}", @@ -806,28 +805,44 @@ pub(crate) fn 
validate_decimal256_precision_with_lt_bytes( } } -#[cfg(test)] -mod test { - use super::*; - use crate::util::decimal::Decimal256; - use num::{BigInt, Num}; - - #[test] - fn test_decimal256_min_max_for_precision() { - // The precision from 1 to 76 - let mut max_value = "9".to_string(); - let mut min_value = "-9".to_string(); - for i in 1..77 { - let max_decimal = - Decimal256::from(BigInt::from_str_radix(max_value.as_str(), 10).unwrap()); - let min_decimal = - Decimal256::from(BigInt::from_str_radix(min_value.as_str(), 10).unwrap()); - let max_bytes = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; - let min_bytes = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; - max_value += "9"; - min_value += "9"; - assert_eq!(max_decimal.raw_value(), &max_bytes); - assert_eq!(min_decimal.raw_value(), &min_bytes); +// Compares two signed integers that are encoded as little-endian bytes. +// `left` and `right` must have the same, non-zero length. +#[inline] +pub fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { + assert_eq!( + left.len(), + right.len(), + "Can't compare bytes array with different len: {}, {}", + left.len(), + right.len() + ); + assert_ne!(left.len(), 0, "Can't compare bytes array of length 0"); + let len = left.len(); + // if the sign bit (top bit of the last byte) is 1, the value is negative + let left_negative = left[len - 1] >= 0x80_u8; + let right_negative = right[len - 1] >= 0x80_u8; + if left_negative != right_negative { + return match left_negative { + true => { + // left is negative value + // right is positive value + Ordering::Less + } + false => Ordering::Greater, + }; + } + for i in 0..len { + let l_byte = left[len - 1 - i]; + let r_byte = right[len - 1 - i]; + match l_byte.cmp(&r_byte) { + Ordering::Less => { + return Ordering::Less; + } + Ordering::Greater => { + return Ordering::Greater; + } + Ordering::Equal => {} } } + Ordering::Equal } diff --git a/arrow/src/array/equal/boolean.rs b/arrow-data/src/equal/boolean.rs similarity index 77% rename from 
arrow/src/array/equal/boolean.rs rename to arrow-data/src/equal/boolean.rs index fddf21b963a..52e822f03f3 100644 --- a/arrow/src/array/equal/boolean.rs +++ b/arrow-data/src/equal/boolean.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::util::bit_iterator::BitIndexIterator; -use crate::util::bit_util::get_bit; +use crate::bit_iterator::BitIndexIterator; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::utils::{equal_bits, equal_len}; @@ -88,30 +88,3 @@ pub(super) fn boolean_equal( }) } } - -#[cfg(test)] -mod tests { - use crate::array::{Array, BooleanArray}; - - #[test] - fn test_boolean_slice() { - let array = BooleanArray::from(vec![true; 32]); - let slice = array.slice(4, 12); - assert_eq!(slice.data(), slice.data()); - - let slice = array.slice(8, 12); - assert_eq!(slice.data(), slice.data()); - - let slice = array.slice(8, 24); - assert_eq!(slice.data(), slice.data()); - } - - #[test] - fn test_sliced_nullable_boolean_array() { - let a = BooleanArray::from(vec![None; 32]); - let b = BooleanArray::from(vec![true; 32]); - let slice_a = a.slice(1, 12); - let slice_b = b.slice(1, 12); - assert_ne!(slice_a.data(), slice_b.data()); - } -} diff --git a/arrow/src/array/equal/decimal.rs b/arrow-data/src/equal/decimal.rs similarity index 95% rename from arrow/src/array/equal/decimal.rs rename to arrow-data/src/equal/decimal.rs index 49112608c3a..15703389cb8 100644 --- a/arrow/src/array/equal/decimal.rs +++ b/arrow-data/src/equal/decimal.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::utils::equal_len; diff --git a/arrow/src/array/equal/dictionary.rs b/arrow-data/src/equal/dictionary.rs similarity index 95% rename from arrow/src/array/equal/dictionary.rs rename to arrow-data/src/equal/dictionary.rs index 1474da5e2d2..5638c5c91c5 100644 --- a/arrow/src/array/equal/dictionary.rs +++ b/arrow-data/src/equal/dictionary.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::ArrowNativeType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::{bit_util::get_bit, ArrowNativeType}; use super::equal_range; diff --git a/arrow/src/array/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs similarity index 95% rename from arrow/src/array/equal/fixed_binary.rs rename to arrow-data/src/equal/fixed_binary.rs index 58eb22bb19b..d6af208016f 100644 --- a/arrow/src/array/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::utils::equal_len; diff --git a/arrow/src/array/equal/fixed_list.rs b/arrow-data/src/equal/fixed_list.rs similarity index 95% rename from arrow/src/array/equal/fixed_list.rs rename to arrow-data/src/equal/fixed_list.rs index 055bcece135..204a8658e74 100644 --- a/arrow/src/array/equal/fixed_list.rs +++ b/arrow-data/src/equal/fixed_list.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::equal_range; diff --git a/arrow/src/array/equal/list.rs b/arrow-data/src/equal/list.rs similarity index 75% rename from arrow/src/array/equal/list.rs rename to arrow-data/src/equal/list.rs index b3bca9a6922..25273f8bad6 100644 --- a/arrow/src/array/equal/list.rs +++ b/arrow-data/src/equal/list.rs @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{ - array::ArrayData, - array::{data::count_nulls, OffsetSizeTrait}, - util::bit_util::get_bit, -}; +use crate::data::{count_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_buffer::ArrowNativeType; +use num::Integer; use super::equal_range; -fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { +fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { // invariant from `base_equal` debug_assert_eq!(lhs.len(), rhs.len()); @@ -45,7 +44,7 @@ fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { }) } -pub(super) fn list_equal( +pub(super) fn list_equal( lhs: &ArrayData, rhs: &ArrayData, lhs_start: usize, @@ -149,52 +148,3 @@ pub(super) fn list_equal( }) } } - -#[cfg(test)] -mod tests { - use crate::{ - array::{Array, Int64Builder, ListArray, ListBuilder}, - datatypes::Int32Type, - }; - - #[test] - fn list_array_non_zero_nulls() { - // Tests handling of list arrays with non-empty null ranges - let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); - builder.values().append_value(1); - builder.values().append_value(2); - builder.values().append_value(3); - builder.append(true); - builder.append(false); - let array1 = builder.finish(); - - let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); - builder.values().append_value(1); - builder.values().append_value(2); - builder.values().append_value(3); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.append(false); - let array2 = builder.finish(); - - assert_eq!(array1, array2); - } - - #[test] - fn test_list_different_offsets() { - let a = ListArray::from_iter_primitive::([ - Some([Some(0), Some(0)]), - Some([Some(1), Some(2)]), - Some([None, None]), - ]); - let b = ListArray::from_iter_primitive::([ - Some([Some(1), Some(2)]), - Some([None, None]), - Some([None, None]), - ]); - let a_slice = a.slice(1, 2); - let b_slice = b.slice(0, 2); - assert_eq!(&a_slice, &b_slice); - } -} diff --git a/arrow-data/src/equal/mod.rs 
b/arrow-data/src/equal/mod.rs new file mode 100644 index 00000000000..063ef64d4d8 --- /dev/null +++ b/arrow-data/src/equal/mod.rs @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module containing functionality to compute array equality. +//! This module uses [ArrayData] and does not +//! depend on dynamic casting of `Array`. + +use crate::data::ArrayData; +use arrow_schema::{DataType, IntervalUnit}; +use half::f16; + +mod boolean; +mod decimal; +mod dictionary; +mod fixed_binary; +mod fixed_list; +mod list; +mod null; +mod primitive; +mod structure; +mod union; +mod utils; +mod variable_size; + +// these methods assume the same type, len and null count. +// For this reason, they are not exposed and are instead used +// to build the generic functions below (`equal_range` and `equal`). 
+use boolean::boolean_equal; +use decimal::decimal_equal; +use dictionary::dictionary_equal; +use fixed_binary::fixed_binary_equal; +use fixed_list::fixed_list_equal; +use list::list_equal; +use null::null_equal; +use primitive::primitive_equal; +use structure::struct_equal; +use union::union_equal; +use variable_size::variable_sized_equal; + +/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively +/// for `len` slots. +#[inline] +fn equal_values( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + match lhs.data_type() { + DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Boolean => boolean_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Date64 + | DataType::Interval(IntervalUnit::DayTime) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) => { + primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive_equal::(lhs, rhs, 
lhs_start, rhs_start, len) + } + DataType::Utf8 | DataType::Binary => { + variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::LargeUtf8 | DataType::LargeBinary => { + variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::FixedSizeBinary(_) => { + fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { + decimal_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::FixedSizeList(_, _) => { + fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Union(_, _, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Dictionary(data_type, _) => match data_type.as_ref() { + DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int16 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Int32 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Int64 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt8 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt16 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt32 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt64 => { + dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + _ => unreachable!(), + }, + DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + } +} + +fn equal_range( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + utils::equal_nulls(lhs, rhs, lhs_start, rhs_start, len) + && 
equal_values(lhs, rhs, lhs_start, rhs_start, len) +} + +/// Logically compares two [ArrayData]. +/// Two arrays are logically equal if and only if: +/// * their data types are equal +/// * their lengths are equal +/// * their null counts are equal +/// * their null bitmaps are equal +/// * each of their items is equal +/// Two items are equal when their in-memory representation is physically equal (i.e. same bit content). +/// The physical comparison depends on the data type. +/// # Panics +/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification. +/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`) +pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { + utils::base_equal(lhs, rhs) + && lhs.null_count() == rhs.null_count() + && utils::equal_nulls(lhs, rhs, 0, 0, lhs.len()) + && equal_values(lhs, rhs, 0, 0, lhs.len()) +} + +// See arrow/tests/array_equal.rs for tests diff --git a/arrow/src/array/equal/null.rs b/arrow-data/src/equal/null.rs similarity index 97% rename from arrow/src/array/equal/null.rs rename to arrow-data/src/equal/null.rs index f287a382507..1478e448cec 100644 --- a/arrow/src/array/equal/null.rs +++ b/arrow-data/src/equal/null.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::ArrayData; +use crate::data::ArrayData; #[inline] pub(super) fn null_equal( diff --git a/arrow/src/array/equal/primitive.rs b/arrow-data/src/equal/primitive.rs similarity index 96% rename from arrow/src/array/equal/primitive.rs rename to arrow-data/src/equal/primitive.rs index b82d3213ab0..e619375d531 100644 --- a/arrow/src/array/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -17,8 +17,8 @@ use std::mem::size_of; -use crate::array::{data::contains_nulls, ArrayData}; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::utils::equal_len; diff --git a/arrow/src/array/equal/structure.rs b/arrow-data/src/equal/structure.rs similarity index 96% rename from arrow/src/array/equal/structure.rs rename to arrow-data/src/equal/structure.rs index 384376c3468..25ab340cd3f 100644 --- a/arrow/src/array/equal/structure.rs +++ b/arrow-data/src/equal/structure.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::data::contains_nulls, array::ArrayData, util::bit_util::get_bit}; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::equal_range; diff --git a/arrow/src/array/equal/union.rs b/arrow-data/src/equal/union.rs similarity index 98% rename from arrow/src/array/equal/union.rs rename to arrow-data/src/equal/union.rs index e8b9d27b6f0..fdf77009686 100644 --- a/arrow/src/array/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{array::ArrayData, datatypes::DataType, datatypes::UnionMode}; +use crate::data::ArrayData; +use arrow_schema::{DataType, UnionMode}; use super::equal_range; diff --git a/arrow/src/array/equal/utils.rs b/arrow-data/src/equal/utils.rs similarity index 96% rename from arrow/src/array/equal/utils.rs rename to arrow-data/src/equal/utils.rs index 449055d366e..b3f7fc0b06e 100644 --- a/arrow/src/array/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::data::contains_nulls; -use crate::array::ArrayData; -use crate::datatypes::DataType; -use crate::util::bit_chunk_iterator::BitChunks; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_chunk_iterator::BitChunks; +use arrow_schema::DataType; // whether bits along the positions are equal // `lhs_start`, `rhs_start` and `len` are _measured in bits_. diff --git a/arrow/src/array/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs similarity index 92% rename from arrow/src/array/equal/variable_size.rs rename to arrow-data/src/equal/variable_size.rs index f40f79e404a..b4445db54bb 100644 --- a/arrow/src/array/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use crate::util::bit_util::get_bit; -use crate::{ - array::data::count_nulls, - array::{ArrayData, OffsetSizeTrait}, -}; +use crate::data::{count_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_buffer::ArrowNativeType; +use num::Integer; use super::utils::equal_len; -fn offset_value_equal( +fn offset_value_equal( lhs_values: &[u8], rhs_values: &[u8], lhs_offsets: &[T], @@ -47,7 +46,7 @@ fn offset_value_equal( ) } -pub(super) fn variable_sized_equal( +pub(super) fn variable_sized_equal( lhs: &ArrayData, rhs: &ArrayData, lhs_start: usize, diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs new file mode 100644 index 00000000000..9b7e307db36 --- /dev/null +++ b/arrow-data/src/lib.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Array data abstractions for [Apache Arrow](https://docs.rs/arrow) + +mod bitmap; +pub use bitmap::Bitmap; +mod data; +pub use data::*; + +mod equal; +pub mod transform; + +pub mod bit_iterator; +pub mod bit_mask; +pub mod decimal; diff --git a/arrow/src/array/transform/boolean.rs b/arrow-data/src/transform/boolean.rs similarity index 95% rename from arrow/src/array/transform/boolean.rs rename to arrow-data/src/transform/boolean.rs index e0b6231a226..d93fa15a4e0 100644 --- a/arrow/src/array/transform/boolean.rs +++ b/arrow-data/src/transform/boolean.rs @@ -16,8 +16,8 @@ // under the License. use super::{Extend, _MutableArrayData, utils::resize_for_bits}; -use crate::array::ArrayData; -use crate::util::bit_mask::set_bits; +use crate::bit_mask::set_bits; +use crate::ArrayData; pub(super) fn build_extend(array: &ArrayData) -> Extend { let values = array.buffers()[0].as_slice(); diff --git a/arrow/src/array/transform/fixed_binary.rs b/arrow-data/src/transform/fixed_binary.rs similarity index 97% rename from arrow/src/array/transform/fixed_binary.rs rename to arrow-data/src/transform/fixed_binary.rs index 6d6262ca3c4..fe21a6bc382 100644 --- a/arrow/src/array/transform/fixed_binary.rs +++ b/arrow-data/src/transform/fixed_binary.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{array::ArrayData, datatypes::DataType}; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; +use arrow_schema::DataType; pub(super) fn build_extend(array: &ArrayData) -> Extend { let size = match array.data_type() { diff --git a/arrow/src/array/transform/fixed_size_list.rs b/arrow-data/src/transform/fixed_size_list.rs similarity index 97% rename from arrow/src/array/transform/fixed_size_list.rs rename to arrow-data/src/transform/fixed_size_list.rs index 77912a7026f..ad369c2be8a 100644 --- a/arrow/src/array/transform/fixed_size_list.rs +++ b/arrow-data/src/transform/fixed_size_list.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; -use crate::datatypes::DataType; +use crate::ArrayData; +use arrow_schema::DataType; use super::{Extend, _MutableArrayData}; diff --git a/arrow/src/array/transform/list.rs b/arrow-data/src/transform/list.rs similarity index 92% rename from arrow/src/array/transform/list.rs rename to arrow-data/src/transform/list.rs index 8eb2bd1778d..f318d46f498 100644 --- a/arrow/src/array/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -15,14 +15,15 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{ArrayData, OffsetSizeTrait}; - use super::{ Extend, _MutableArrayData, utils::{extend_offsets, get_last_offset}, }; +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; +use num::Integer; -pub(super) fn build_extend(array: &ArrayData) -> Extend { +pub(super) fn build_extend(array: &ArrayData) -> Extend { let offsets = array.buffer::(0); if array.null_count() == 0 { // fast case where we can copy regions without nullability checks @@ -69,7 +70,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { (start..start + len).for_each(|i| { if array.is_valid(i) { // compute the new offset - last_offset += offsets[i + 1] - offsets[i]; + last_offset = last_offset + offsets[i + 1] - offsets[i]; // append value child.extend( @@ -86,7 +87,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { } } -pub(super) fn extend_nulls( +pub(super) fn extend_nulls( mutable: &mut _MutableArrayData, len: usize, ) { diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs new file mode 100644 index 00000000000..c34376aaba2 --- /dev/null +++ b/arrow-data/src/transform/mod.rs @@ -0,0 +1,672 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use super::{ + data::{into_buffers, new_buffers}, + ArrayData, ArrayDataBuilder, +}; +use crate::bit_mask::set_bits; +use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; +use half::f16; +use num::Integer; +use std::mem; + +mod boolean; +mod fixed_binary; +mod fixed_size_list; +mod list; +mod null; +mod primitive; +mod structure; +mod union; +mod utils; +mod variable_size; + +type ExtendNullBits<'a> = Box; +// function that extends `[start..start+len]` to the mutable array. +// this is dynamic because different data_types influence how buffers and children are extended. +type Extend<'a> = Box; + +type ExtendNulls = Box; + +/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. +/// This is just a data container. +#[derive(Debug)] +struct _MutableArrayData<'a> { + pub data_type: DataType, + pub null_count: usize, + + pub len: usize, + pub null_buffer: MutableBuffer, + + // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). + // Thus, we place them in the stack to avoid bound checks and greater data locality. 
+ pub buffer1: MutableBuffer, + pub buffer2: MutableBuffer, + pub child_data: Vec>, +} + +impl<'a> _MutableArrayData<'a> { + fn freeze(self, dictionary: Option) -> ArrayDataBuilder { + let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); + + let child_data = match self.data_type { + DataType::Dictionary(_, _) => vec![dictionary.unwrap()], + _ => { + let mut child_data = Vec::with_capacity(self.child_data.len()); + for child in self.child_data { + child_data.push(child.freeze()); + } + child_data + } + }; + + ArrayDataBuilder::new(self.data_type) + .offset(0) + .len(self.len) + .null_count(self.null_count) + .buffers(buffers) + .child_data(child_data) + .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) + } +} + +fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { + if let Some(bitmap) = array.null_bitmap() { + let bytes = bitmap.buffer().as_slice(); + Box::new(move |mutable, start, len| { + utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); + mutable.null_count += set_bits( + mutable.null_buffer.as_slice_mut(), + bytes, + mutable.len, + array.offset() + start, + len, + ); + }) + } else if use_nulls { + Box::new(|mutable, _, len| { + utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); + let write_data = mutable.null_buffer.as_slice_mut(); + let offset = mutable.len; + (0..len).for_each(|i| { + bit_util::set_bit(write_data, offset + i); + }); + }) + } else { + Box::new(|_, _, _| {}) + } +} + +/// Struct to efficiently and interactively create an [ArrayData] from an existing [ArrayData] by +/// copying chunks. +/// +/// The main use case of this struct is to perform unary operations to arrays of arbitrary types, +/// such as `filter` and `take`. 
+pub struct MutableArrayData<'a> { + #[allow(dead_code)] + arrays: Vec<&'a ArrayData>, + // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to + // mutability invariants (interior mutability): + // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not + // [MutableArrayData] itself + data: _MutableArrayData<'a>, + + // the child data of the `Array` in Dictionary arrays. + // This is not stored in `_MutableArrayData` because these values are constant and only needed + // at the end, when freezing [_MutableArrayData]. + dictionary: Option, + + // function used to extend values from arrays. This function's lifetime is bound to the array + // because it reads values from it. + extend_values: Vec>, + // function used to extend nulls from arrays. This function's lifetime is bound to the array + // because it reads nulls from it. + extend_null_bits: Vec>, + + // function used to extend nulls. + // this is independent of the arrays and therefore has no lifetime. + extend_nulls: ExtendNulls, +} + +impl<'a> std::fmt::Debug for MutableArrayData<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // ignores the closures. + f.debug_struct("MutableArrayData") + .field("data", &self.data) + .finish() + } +} + +/// Builds an extend that adds `offset` to the source primitive. +/// Additionally validates that `max` fits into +/// the underlying primitive, returning None if not +fn build_extend_dictionary( + array: &ArrayData, + offset: usize, + max: usize, +) -> Option { + macro_rules! 
validate_and_build { + ($dt: ty) => {{ + let _: $dt = max.try_into().ok()?; + let offset: $dt = offset.try_into().ok()?; + Some(primitive::build_extend_with_offset(array, offset)) + }}; + } + match array.data_type() { + DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { + DataType::UInt8 => validate_and_build!(u8), + DataType::UInt16 => validate_and_build!(u16), + DataType::UInt32 => validate_and_build!(u32), + DataType::UInt64 => validate_and_build!(u64), + DataType::Int8 => validate_and_build!(i8), + DataType::Int16 => validate_and_build!(i16), + DataType::Int32 => validate_and_build!(i32), + DataType::Int64 => validate_and_build!(i64), + _ => unreachable!(), + }, + _ => None, + } +} + +fn build_extend(array: &ArrayData) -> Extend { + match array.data_type() { + DataType::Decimal128(_, _) => primitive::build_extend::(array), + DataType::Null => null::build_extend(array), + DataType::Boolean => boolean::build_extend(array), + DataType::UInt8 => primitive::build_extend::(array), + DataType::UInt16 => primitive::build_extend::(array), + DataType::UInt32 => primitive::build_extend::(array), + DataType::UInt64 => primitive::build_extend::(array), + DataType::Int8 => primitive::build_extend::(array), + DataType::Int16 => primitive::build_extend::(array), + DataType::Int32 => primitive::build_extend::(array), + DataType::Int64 => primitive::build_extend::(array), + DataType::Float32 => primitive::build_extend::(array), + DataType::Float64 => primitive::build_extend::(array), + DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + primitive::build_extend::(array) + } + DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => { + primitive::build_extend::(array) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive::build_extend::(array) + } + DataType::Utf8 | DataType::Binary => 
variable_size::build_extend::(array), + DataType::LargeUtf8 | DataType::LargeBinary => { + variable_size::build_extend::(array) + } + DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), + DataType::LargeList(_) => list::build_extend::(array), + DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), + DataType::Struct(_) => structure::build_extend(array), + DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { + fixed_binary::build_extend(array) + } + DataType::Float16 => primitive::build_extend::(array), + DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), + DataType::Union(_, _, mode) => match mode { + UnionMode::Sparse => union::build_extend_sparse(array), + UnionMode::Dense => union::build_extend_dense(array), + }, + } +} + +fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { + Box::new(match data_type { + DataType::Decimal128(_, _) => primitive::extend_nulls::, + DataType::Null => null::extend_nulls, + DataType::Boolean => boolean::extend_nulls, + DataType::UInt8 => primitive::extend_nulls::, + DataType::UInt16 => primitive::extend_nulls::, + DataType::UInt32 => primitive::extend_nulls::, + DataType::UInt64 => primitive::extend_nulls::, + DataType::Int8 => primitive::extend_nulls::, + DataType::Int16 => primitive::extend_nulls::, + DataType::Int32 => primitive::extend_nulls::, + DataType::Int64 => primitive::extend_nulls::, + DataType::Float32 => primitive::extend_nulls::, + DataType::Float64 => primitive::extend_nulls::, + DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, + DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, + DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, + 
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, + DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, + DataType::LargeList(_) => list::extend_nulls::, + DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { + DataType::UInt8 => primitive::extend_nulls::, + DataType::UInt16 => primitive::extend_nulls::, + DataType::UInt32 => primitive::extend_nulls::, + DataType::UInt64 => primitive::extend_nulls::, + DataType::Int8 => primitive::extend_nulls::, + DataType::Int16 => primitive::extend_nulls::, + DataType::Int32 => primitive::extend_nulls::, + DataType::Int64 => primitive::extend_nulls::, + _ => unreachable!(), + }, + DataType::Struct(_) => structure::extend_nulls, + DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { + fixed_binary::extend_nulls + } + DataType::Float16 => primitive::extend_nulls::, + DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, + DataType::Union(_, _, mode) => match mode { + UnionMode::Sparse => union::extend_nulls_sparse, + UnionMode::Dense => union::extend_nulls_dense, + }, + }) +} + +fn preallocate_offset_and_binary_buffer( + capacity: usize, + binary_size: usize, +) -> [MutableBuffer; 2] { + // offsets + let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); + // safety: `unsafe` code assumes that this buffer is initialized with one element + buffer.push(Offset::zero()); + + [ + buffer, + MutableBuffer::new(binary_size * mem::size_of::()), + ] +} + +/// Define capacities of child data or data buffers. 
+#[derive(Debug, Clone)] +pub enum Capacities { + /// Binary, Utf8 and LargeUtf8 data types + /// Define + /// * the capacity of the array offsets + /// * the capacity of the binary/ str buffer + Binary(usize, Option), + /// List and LargeList data types + /// Define + /// * the capacity of the array offsets + /// * the capacity of the child data + List(usize, Option>), + /// Struct type + /// * the capacity of the array + /// * the capacities of the fields + Struct(usize, Option>), + /// Dictionary type + /// * the capacity of the array/keys + /// * the capacity of the values + Dictionary(usize, Option>), + /// Don't preallocate inner buffers and rely on array growth strategy + Array(usize), +} +impl<'a> MutableArrayData<'a> { + /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an + /// [ArrayData] from multiple `arrays`. + /// + /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls + /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. + /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. + pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { + Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) + } + + /// Similar to [MutableArrayData::new], but lets users define the preallocated capacities of the array. + /// See also [MutableArrayData::new] for more information on the arguments. + /// + /// # Panic + /// This function panics if the given `capacities` don't match the data type of `arrays`. Or when + /// a [Capacities] variant is not yet supported. 
+ pub fn with_capacities( + arrays: Vec<&'a ArrayData>, + use_nulls: bool, + capacities: Capacities, + ) -> Self { + let data_type = arrays[0].data_type(); + + // if any of the arrays has nulls, insertions from any array requires setting bits + // as there is at least one array with nulls. + let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); + + let mut array_capacity; + + let [buffer1, buffer2] = match (data_type, &capacities) { + ( + DataType::LargeUtf8 | DataType::LargeBinary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { + array_capacity = *capacity; + preallocate_offset_and_binary_buffer::(*capacity, *value_cap) + } + ( + DataType::Utf8 | DataType::Binary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { + array_capacity = *capacity; + preallocate_offset_and_binary_buffer::(*capacity, *value_cap) + } + (_, Capacities::Array(capacity)) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } + ( + DataType::List(_) | DataType::LargeList(_), + Capacities::List(capacity, _), + ) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } + _ => panic!("Capacities: {:?} not yet supported", capacities), + }; + + let child_data = match &data_type { + DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + | DataType::Null + | DataType::Boolean + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Timestamp(_, _) + | DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::Interval(_) + | DataType::FixedSizeBinary(_) => vec![], + DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { + let childs = arrays + .iter() + 
.map(|array| &array.child_data()[0]) + .collect::>(); + + let capacities = if let Capacities::List(capacity, ref child_capacities) = + capacities + { + child_capacities + .clone() + .map(|c| *c) + .unwrap_or(Capacities::Array(capacity)) + } else { + Capacities::Array(array_capacity) + }; + + vec![MutableArrayData::with_capacities( + childs, use_nulls, capacities, + )] + } + // the dictionary type just appends keys and clones the values. + DataType::Dictionary(_, _) => vec![], + DataType::Struct(fields) => match capacities { + Capacities::Struct(capacity, Some(ref child_capacities)) => { + array_capacity = capacity; + (0..fields.len()) + .zip(child_capacities) + .map(|(i, child_cap)| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::with_capacities( + child_arrays, + use_nulls, + child_cap.clone(), + ) + }) + .collect::>() + } + Capacities::Struct(capacity, None) => { + array_capacity = capacity; + (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, capacity) + }) + .collect::>() + } + _ => (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, array_capacity) + }) + .collect::>(), + }, + DataType::FixedSizeList(_, _) => { + let childs = arrays + .iter() + .map(|array| &array.child_data()[0]) + .collect::>(); + vec![MutableArrayData::new(childs, use_nulls, array_capacity)] + } + DataType::Union(fields, _, _) => (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, array_capacity) + }) + .collect::>(), + }; + + // Get the dictionary if any, and if it is a concatenation of multiple + let (dictionary, dict_concat) = match &data_type { + 
DataType::Dictionary(_, _) => { + // If more than one dictionary, concatenate dictionaries together + let dict_concat = !arrays + .windows(2) + .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); + + match dict_concat { + false => (Some(arrays[0].child_data()[0].clone()), false), + true => { + if let Capacities::Dictionary(_, _) = capacities { + panic!("dictionary capacity not yet supported") + } + let dictionaries: Vec<_> = + arrays.iter().map(|array| &array.child_data()[0]).collect(); + let lengths: Vec<_> = dictionaries + .iter() + .map(|dictionary| dictionary.len()) + .collect(); + let capacity = lengths.iter().sum(); + + let mut mutable = + MutableArrayData::new(dictionaries, false, capacity); + + for (i, len) in lengths.iter().enumerate() { + mutable.extend(i, 0, *len) + } + + (Some(mutable.freeze()), true) + } + } + } + _ => (None, false), + }; + + let extend_nulls = build_extend_nulls(data_type); + + let extend_null_bits = arrays + .iter() + .map(|array| build_extend_null_bits(array, use_nulls)) + .collect(); + + let null_buffer = if use_nulls { + let null_bytes = bit_util::ceil(array_capacity, 8); + MutableBuffer::from_len_zeroed(null_bytes) + } else { + // create 0 capacity mutable buffer with the intention that it won't be used + MutableBuffer::with_capacity(0) + }; + + let extend_values = match &data_type { + DataType::Dictionary(_, _) => { + let mut next_offset = 0; + let extend_values: Result, _> = arrays + .iter() + .map(|array| { + let offset = next_offset; + let dict_len = array.child_data()[0].len(); + + if dict_concat { + next_offset += dict_len; + } + + build_extend_dictionary(array, offset, offset + dict_len) + .ok_or(ArrowError::DictionaryKeyOverflowError) + }) + .collect(); + + extend_values.expect("MutableArrayData::new is infallible") + } + _ => arrays.iter().map(|array| build_extend(array)).collect(), + }; + + let data = _MutableArrayData { + data_type: data_type.clone(), + len: 0, + null_count: 0, + null_buffer, + buffer1, + 
buffer2, + child_data, + }; + Self { + arrays, + data, + dictionary, + extend_values, + extend_null_bits, + extend_nulls, + } + } + + /// Extends this array with a chunk of its source arrays + /// + /// # Arguments + /// * `index` - the index of the array that you want to copy values from + /// * `start` - the start index of the chunk (inclusive) + /// * `end` - the end index of the chunk (exclusive) + /// + /// # Panic + /// This function panics if there is an invalid index, + /// i.e. `index` >= the number of source arrays + /// or `end` > the length of the `index`th array + pub fn extend(&mut self, index: usize, start: usize, end: usize) { + let len = end - start; + (self.extend_null_bits[index])(&mut self.data, start, len); + (self.extend_values[index])(&mut self.data, index, start, len); + self.data.len += len; + } + + /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays + pub fn extend_nulls(&mut self, len: usize) { + // TODO: null_buffer should probably be extended here as well + // otherwise is_valid() could later panic + // add test to confirm + self.data.null_count += len; + (self.extend_nulls)(&mut self.data, len); + self.data.len += len; + } + + /// Returns the current length + #[inline] + pub fn len(&self) -> usize { + self.data.len + } + + /// Returns true if len is 0 + #[inline] + pub fn is_empty(&self) -> bool { + self.data.len == 0 + } + + /// Returns the current null count + #[inline] + pub fn null_count(&self) -> usize { + self.data.null_count + } + + /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. + pub fn freeze(self) -> ArrayData { + unsafe { self.data.freeze(self.dictionary).build_unchecked() } + } + + /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. + /// This is useful for extending the default behavior of MutableArrayData. 
+ pub fn into_builder(self) -> ArrayDataBuilder { + self.data.freeze(self.dictionary) + } +} + +// See arrow/tests/array_transform.rs for tests of transform functionality + +#[cfg(test)] +mod test { + use super::*; + use arrow_schema::Field; + + #[test] + fn test_list_append_with_capacities() { + let array = ArrayData::new_empty(&DataType::List(Box::new(Field::new( + "element", + DataType::Int64, + false, + )))); + + let mutable = MutableArrayData::with_capacities( + vec![&array], + false, + Capacities::List(6, Some(Box::new(Capacities::Array(17)))), + ); + + // capacities are rounded up to multiples of 64 by MutableBuffer + assert_eq!(mutable.data.buffer1.capacity(), 64); + assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); + } +} diff --git a/arrow/src/array/transform/null.rs b/arrow-data/src/transform/null.rs similarity index 97% rename from arrow/src/array/transform/null.rs rename to arrow-data/src/transform/null.rs index e1335e17971..5d1535564d9 100644 --- a/arrow/src/array/transform/null.rs +++ b/arrow-data/src/transform/null.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend(_: &ArrayData) -> Extend { Box::new(move |_, _, _, _| {}) diff --git a/arrow/src/array/transform/primitive.rs b/arrow-data/src/transform/primitive.rs similarity index 96% rename from arrow/src/array/transform/primitive.rs rename to arrow-data/src/transform/primitive.rs index 4c765c0c0d9..b5c826438bf 100644 --- a/arrow/src/array/transform/primitive.rs +++ b/arrow-data/src/transform/primitive.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+use crate::ArrayData; +use arrow_buffer::ArrowNativeType; use std::mem::size_of; use std::ops::Add; -use crate::{array::ArrayData, datatypes::ArrowNativeType}; - use super::{Extend, _MutableArrayData}; pub(super) fn build_extend(array: &ArrayData) -> Extend { diff --git a/arrow/src/array/transform/structure.rs b/arrow-data/src/transform/structure.rs similarity index 98% rename from arrow/src/array/transform/structure.rs rename to arrow-data/src/transform/structure.rs index 5c41d76a7f1..c6841da4d83 100644 --- a/arrow/src/array/transform/structure.rs +++ b/arrow-data/src/transform/structure.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend(array: &ArrayData) -> Extend { if array.null_count() == 0 { diff --git a/arrow/src/array/transform/union.rs b/arrow-data/src/transform/union.rs similarity index 98% rename from arrow/src/array/transform/union.rs rename to arrow-data/src/transform/union.rs index bbea508219d..8d1ea34c314 100644 --- a/arrow/src/array/transform/union.rs +++ b/arrow-data/src/transform/union.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend { let type_ids = array.buffer::(0); diff --git a/arrow/src/array/transform/utils.rs b/arrow-data/src/transform/utils.rs similarity index 89% rename from arrow/src/array/transform/utils.rs rename to arrow-data/src/transform/utils.rs index 68aee79c41b..6a4c240c9ae 100644 --- a/arrow/src/array/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util}; +use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; +use num::Integer; /// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero. #[inline] @@ -26,7 +27,7 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { } } -pub(super) fn extend_offsets( +pub(super) fn extend_offsets( buffer: &mut MutableBuffer, mut last_offset: T, offsets: &[T], @@ -35,13 +36,13 @@ pub(super) fn extend_offsets( offsets.windows(2).for_each(|offsets| { // compute the new offset let length = offsets[1] - offsets[0]; - last_offset += length; + last_offset = last_offset + length; buffer.push(last_offset); }); } #[inline] -pub(super) unsafe fn get_last_offset( +pub(super) unsafe fn get_last_offset( offset_buffer: &MutableBuffer, ) -> T { // JUSTIFICATION diff --git a/arrow/src/array/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs similarity index 87% rename from arrow/src/array/transform/variable_size.rs rename to arrow-data/src/transform/variable_size.rs index c9304dbca20..73c4783189d 100644 --- a/arrow/src/array/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{ - array::{ArrayData, OffsetSizeTrait}, - buffer::MutableBuffer, -}; +use crate::ArrayData; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use num::traits::AsPrimitive; +use num::Integer; use super::{ Extend, _MutableArrayData, @@ -26,20 +26,22 @@ use super::{ }; #[inline] -fn extend_offset_values( +fn extend_offset_values>( buffer: &mut MutableBuffer, offsets: &[T], values: &[u8], start: usize, len: usize, ) { - let start_values = offsets[start].to_usize().unwrap(); - let end_values = offsets[start + len].to_usize().unwrap(); + let start_values = offsets[start].as_(); + let end_values = offsets[start + len].as_(); let new_values = &values[start_values..end_values]; buffer.extend_from_slice(new_values); } -pub(super) fn build_extend(array: &ArrayData) -> Extend { +pub(super) fn build_extend>( + array: &ArrayData, +) -> Extend { let offsets = array.buffer::(0); let values = array.buffers()[1].as_slice(); if array.null_count() == 0 { @@ -77,7 +79,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { if array.is_valid(i) { // compute the new offset let length = offsets[i + 1] - offsets[i]; - last_offset += length; + last_offset = last_offset + length; // append value let bytes = &values[offsets[i].to_usize().unwrap() @@ -92,7 +94,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { } } -pub(super) fn extend_nulls( +pub(super) fn extend_nulls( mutable: &mut _MutableArrayData, len: usize, ) { diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index f9e70eb8d77..edfe2c680da 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use arrow::array::{ArrayData, ArrayRef, Int64Array}; +use arrow::array::{ArrayData, ArrayRef, Int64Array, make_array}; use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use 
arrow::error::ArrowError; @@ -39,7 +39,7 @@ fn to_py_err(err: ArrowError) -> PyErr { #[pyfunction] fn double(array: &PyAny, py: Python) -> PyResult { // import - let array = ArrayRef::from_pyarrow(array)?; + let array = make_array(ArrayData::from_pyarrow(array)?); // perform some operation let array = array @@ -65,7 +65,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { // to py let pyarray = array.to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; - let array = ArrayRef::from_pyarrow(pyarray)?; + let array = make_array(ArrayData::from_pyarrow(pyarray)?); Ok(array == expected) } @@ -77,7 +77,7 @@ fn substring( start: i64, ) -> PyResult> { // import - let array = ArrayRef::from(array.0); + let array = make_array(array.0); // substring let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; @@ -88,12 +88,12 @@ fn substring( /// Returns the concatenate #[pyfunction] fn concatenate(array: PyArrowType, py: Python) -> PyResult { - let array = ArrayRef::from(array.0); + let array = make_array(array.0); // concat let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; - array.to_pyarrow(py) + array.data().to_pyarrow(py) } #[pyfunction] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d49acef335d..f29c4e31791 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -45,6 +45,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] [dependencies] arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-data = { version = "23.0.0", path = "../arrow-data" } arrow-schema = { version = "23.0.0", path = "../arrow-schema" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } @@ -88,7 +89,7 @@ pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it 
is too computationally expensive # but is run as part of our CI checks -force_validate = [] +force_validate = ["arrow-data/force_validate"] # Enable ffi support ffi = [] # Enable dyn-comparison of dictionary arrays with other arrays diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 38ba2025a2e..2c2969c925d 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -229,7 +229,7 @@ impl Array for ArrayRef { } fn into_data(self) -> ArrayData { - self.into() + self.data().clone() } fn data_ref(&self) -> &ArrayData { @@ -358,6 +358,90 @@ pub trait ArrayAccessor: Array { unsafe fn value_unchecked(&self, index: usize) -> Self::Item; } +impl PartialEq for dyn Array { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for dyn Array { + fn eq(&self, other: &T) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for NullArray { + fn eq(&self, other: &NullArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for PrimitiveArray { + fn eq(&self, other: &PrimitiveArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for DictionaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for BooleanArray { + fn eq(&self, other: &BooleanArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericStringArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericBinaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for FixedSizeBinaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for Decimal128Array { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericListArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for MapArray { 
+ fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for FixedSizeListArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for StructArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + /// Constructs an array using the input `data`. /// Returns a reference-counted `Array` instance. pub fn make_array(data: ArrayData) -> ArrayRef { @@ -470,18 +554,6 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } } -impl From for ArrayRef { - fn from(data: ArrayData) -> Self { - make_array(data) - } -} - -impl From for ArrayData { - fn from(array: ArrayRef) -> Self { - array.data().clone() - } -} - /// Creates a new empty array /// /// ``` diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 543fda1b1a8..f6a2dda2da5 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -284,7 +284,11 @@ impl DecimalArray { // safety: self.data is valid DataType::Decimal as checked above let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); - Ok(self.data().clone().with_data_type(new_data_type).into()) + let data = self.data().clone().into_builder().data_type(new_data_type); + + // SAFETY + // Validated data above + Ok(unsafe { data.build_unchecked().into() }) } // validate that the new precision and scale are valid or not diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs deleted file mode 100644 index 52be64a3fa7..00000000000 --- a/arrow/src/array/equal/mod.rs +++ /dev/null @@ -1,1464 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Module containing functionality to compute array equality. -//! This module uses [ArrayData] and does not -//! depend on dynamic casting of `Array`. - -use super::{ - Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, - FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, - GenericStringArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, - StructArray, -}; -use crate::datatypes::{ArrowPrimitiveType, DataType, IntervalUnit}; -use half::f16; - -mod boolean; -mod decimal; -mod dictionary; -mod fixed_binary; -mod fixed_list; -mod list; -mod null; -mod primitive; -mod structure; -mod union; -mod utils; -mod variable_size; - -// these methods assume the same type, len and null count. -// For this reason, they are not exposed and are instead used -// to build the generic functions below (`equal_range` and `equal`). 
-use boolean::boolean_equal; -use decimal::decimal_equal; -use dictionary::dictionary_equal; -use fixed_binary::fixed_binary_equal; -use fixed_list::fixed_list_equal; -use list::list_equal; -use null::null_equal; -use primitive::primitive_equal; -use structure::struct_equal; -use union::union_equal; -use variable_size::variable_sized_equal; - -impl PartialEq for dyn Array { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for dyn Array { - fn eq(&self, other: &T) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for NullArray { - fn eq(&self, other: &NullArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, other: &PrimitiveArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for DictionaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for BooleanArray { - fn eq(&self, other: &BooleanArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericStringArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for Decimal128Array { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for MapArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for StructArray { - fn eq(&self, other: &Self) -> bool { - 
equal(self.data(), other.data()) - } -} - -/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively -/// for `len` slots. -#[inline] -fn equal_values( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - match lhs.data_type() { - DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Boolean => boolean_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Date64 - | DataType::Interval(IntervalUnit::DayTime) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Utf8 | DataType::Binary => { - variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::FixedSizeBinary(_) => { - 
fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - decimal_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Union(_, _, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Dictionary(data_type, _) => match data_type.as_ref() { - DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt8 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - _ => unreachable!(), - }, - DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - } -} - -fn equal_range( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - utils::equal_nulls(lhs, rhs, lhs_start, rhs_start, len) - && equal_values(lhs, rhs, lhs_start, rhs_start, len) -} - -/// Logically compares two [ArrayData]. 
-/// Two arrays are logically equal if and only if: -/// * their data types are equal -/// * their lengths are equal -/// * their null counts are equal -/// * their null bitmaps are equal -/// * each of their items are equal -/// two items are equal when their in-memory representation is physically equal (i.e. same bit content). -/// The physical comparison depend on the data type. -/// # Panics -/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification. -/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`) -pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { - utils::base_equal(lhs, rhs) - && lhs.null_count() == rhs.null_count() - && utils::equal_nulls(lhs, rhs, 0, 0, lhs.len()) - && equal_values(lhs, rhs, 0, 0, lhs.len()) -} - -#[cfg(test)] -mod tests { - use std::convert::TryFrom; - use std::sync::Arc; - - use crate::array::{ - array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray, - FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, - ListBuilder, NullArray, StringArray, StringDictionaryBuilder, StructArray, - UnionBuilder, - }; - use crate::array::{GenericStringArray, Int32Array}; - use crate::buffer::Buffer; - use crate::datatypes::{Field, Int16Type, Int32Type, ToByteSlice}; - - use super::*; - - #[test] - fn test_null_equal() { - let a = NullArray::new(12); - let a = a.data(); - let b = NullArray::new(12); - let b = b.data(); - test_equal(a, b, true); - - let b = NullArray::new(10); - let b = b.data(); - test_equal(a, b, false); - - // Test the case where offset != 0 - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(5, 4); - let b_slice = b.slice(3, 3); - test_equal(&a_slice, &b_slice, false); - } - - #[test] - fn test_boolean_equal() { - let a = BooleanArray::from(vec![false, false, true]); - let a = a.data(); - let b = BooleanArray::from(vec![false, 
false, true]); - let b = b.data(); - test_equal(a, b, true); - - let b = BooleanArray::from(vec![false, false, false]); - let b = b.data(); - test_equal(a, b, false); - } - - #[test] - fn test_boolean_equal_nulls() { - let a = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let a = a.data(); - let b = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, true); - - let b = BooleanArray::from(vec![None, None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); - - let b = BooleanArray::from(vec![Some(true), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); - } - - #[test] - fn test_boolean_equal_offset() { - let a = BooleanArray::from(vec![false, true, false, true, false, false, true]); - let a = a.data(); - let b = - BooleanArray::from(vec![true, false, false, false, true, false, true, true]); - let b = b.data(); - assert!(!equal(a, b)); - assert!(!equal(b, a)); - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(3, 3); - assert!(equal(&a_slice, &b_slice)); - assert!(equal(&b_slice, &a_slice)); - - let a_slice = a.slice(3, 4); - let b_slice = b.slice(4, 4); - assert!(!equal(&a_slice, &b_slice)); - assert!(!equal(&b_slice, &a_slice)); - - // Test the optimization cases where null_count == 0 and starts at 0 and len >= size_of(u8) - - // Elements fill in `u8`'s exactly. - let mut vector = vec![false, false, true, true, true, true, true, true]; - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector.clone()); - let b = b.data(); - test_equal(a, b, true); - - // Elements fill in `u8`s + suffix bits. 
- vector.push(true); - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector); - let b = b.data(); - test_equal(a, b, true); - } - - #[test] - fn test_primitive() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(3)], - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(4)], - false, - ), - ( - vec![Some(1), Some(2), None], - vec![Some(1), Some(2), None], - true, - ), - ( - vec![Some(1), None, Some(3)], - vec![Some(1), Some(2), None], - false, - ), - ( - vec![Some(1), None, None], - vec![Some(1), Some(2), None], - false, - ), - ]; - - for (lhs, rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_primitive_slice() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - (0, 1), - vec![Some(1), Some(2), Some(3)], - (0, 1), - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - (1, 1), - vec![Some(1), Some(2), Some(3)], - (2, 1), - false, - ), - ( - vec![Some(1), Some(2), None], - (1, 1), - vec![Some(1), None, Some(2)], - (2, 1), - true, - ), - ( - vec![None, Some(2), None], - (1, 1), - vec![None, None, Some(2)], - (2, 1), - true, - ), - ( - vec![Some(1), None, Some(2), None, Some(3)], - (2, 2), - vec![None, Some(2), None, Some(3)], - (1, 2), - true, - ), - ( - vec![Some(1), Some(2), None, Some(0)], - (2, 2), - vec![Some(4), Some(5), Some(0), None], - (2, 2), - false, - ), - ]; - - for (lhs, slice_lhs, rhs, slice_rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let lhs = lhs.slice(slice_lhs.0, slice_lhs.1); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - let rhs = rhs.slice(slice_rhs.0, slice_rhs.1); - - test_equal(&lhs, &rhs, expected); - } - } - - fn test_equal(lhs: &ArrayData, rhs: &ArrayData, expected: bool) { - // equality is symmetric - 
assert!(equal(lhs, lhs), "\n{:?}\n{:?}", lhs, lhs); - assert!(equal(rhs, rhs), "\n{:?}\n{:?}", rhs, rhs); - - assert_eq!(equal(lhs, rhs), expected, "\n{:?}\n{:?}", lhs, rhs); - assert_eq!(equal(rhs, lhs), expected, "\n{:?}\n{:?}", rhs, lhs); - } - - type OptionString = Option; - - fn binary_cases() -> Vec<(Vec, Vec, bool)> { - let base = vec![ - Some("hello".to_owned()), - None, - None, - Some("world".to_owned()), - None, - None, - ]; - let not_base = vec![ - Some("hello".to_owned()), - Some("foo".to_owned()), - None, - Some("world".to_owned()), - None, - None, - ]; - vec![ - ( - vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("world".to_owned())], - true, - ), - ( - vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("arrow".to_owned())], - false, - ), - (base.clone(), base.clone(), true), - (base, not_base, false), - ] - } - - fn test_generic_string_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs: GenericStringArray = lhs.into_iter().collect(); - let lhs = lhs.data(); - let rhs: GenericStringArray = rhs.into_iter().collect(); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_string_equal() { - test_generic_string_equal::() - } - - #[test] - fn test_large_string_equal() { - test_generic_string_equal::() - } - - fn test_generic_binary_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs = lhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let rhs = rhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let lhs = GenericBinaryArray::::from_opt_vec(lhs); - let lhs = lhs.data(); - let rhs = GenericBinaryArray::::from_opt_vec(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_binary_equal() { - test_generic_binary_equal::() - } - - #[test] - fn test_large_binary_equal() { - 
test_generic_binary_equal::() - } - - #[test] - fn test_fixed_size_binary_array() { - let a_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; - let a = FixedSizeBinaryArray::try_from_iter(a_input_arg.into_iter()).unwrap(); - let a = a.data(); - - let b_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; - let b = FixedSizeBinaryArray::try_from_iter(b_input_arg.into_iter()).unwrap(); - let b = b.data(); - - test_equal(a, b, true); - } - - #[test] - fn test_string_offset() { - let a = StringArray::from(vec![Some("a"), None, Some("b")]); - let a = a.data(); - let a = a.slice(2, 1); - let b = StringArray::from(vec![Some("b")]); - let b = b.data(); - - test_equal(&a, b, true); - } - - #[test] - fn test_string_offset_larger() { - let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); - let a = a.data(); - let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); - let b = b.data(); - - test_equal(&a.slice(2, 2), &b.slice(0, 2), false); - test_equal(&a.slice(2, 2), &b.slice(1, 2), true); - test_equal(&a.slice(2, 2), &b.slice(2, 2), false); - } - - #[test] - fn test_null() { - let a = NullArray::new(2); - let a = a.data(); - let b = NullArray::new(2); - let b = b.data(); - test_equal(a, b, true); - - let b = NullArray::new(1); - let b = b.data(); - test_equal(a, b, false); - } - - fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { - let mut builder = ListBuilder::new(Int32Builder::with_capacity(10)); - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - builder.append(false); - } - } - builder.finish().into_data() - } - - #[test] - fn test_list_equal() { - let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - #[test] - fn 
test_empty_offsets_list_equal() { - let empty: Vec = vec![]; - let values = Int32Array::from(empty); - let empty_offsets: [u8; 0] = []; - - let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(0) - .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) - .null_bit_buffer(Some(Buffer::from(&empty_offsets))) - .build() - .unwrap(); - - let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(0) - .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) - .null_bit_buffer(Some(Buffer::from(&empty_offsets))) - .build() - .unwrap(); - - test_equal(&a, &b, true); - - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(0) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data( - Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) - .data() - .clone(), - ) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - - test_equal(&a, &c, true); - } - - // Test the case where null_count > 0 - #[test] - fn test_list_null() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - test_equal(&a, &b, true); - - let b = create_list_array(&[ - Some(&[1, 2]), - None, - Some(&[5, 6]), - Some(&[3, 4]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - test_equal(&a, &b, false); - - // a list where the nullness of values is determined by the list's bitmap - let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - 
.add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(c_values.into_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - - let d_values = Int32Array::from(vec![ - Some(1), - Some(2), - None, - None, - Some(3), - Some(4), - None, - None, - ]); - let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(d_values.into_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - test_equal(&c, &d, true); - } - - // Test the case where offset != 0 - #[test] - fn test_list_offsets() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - fn create_fixed_size_binary_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); - - for d in data.as_ref() { - if let Some(v) = d { - builder.append_value(v.as_ref()).unwrap(); - } else { - builder.append_null(); - } - } - builder.finish().into_data() - } - - #[test] - fn test_fixed_size_binary_equal() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn 
test_fixed_size_binary_null() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); - test_equal(&a, &b, false); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); - test_equal(&a, &b, false); - } - - #[test] - fn test_fixed_size_binary_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"world"), - None, - None, - ]); - let b = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"arrow"), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 1); - let b_slice = b.slice(3, 1); - test_equal(&a_slice, &b_slice, false); - } - - fn create_decimal_array(data: Vec>) -> ArrayData { - data.into_iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap() - .into() - } - - #[test] - fn test_decimal_equal() { - let a = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); - let b = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = create_decimal_array(vec![Some(15_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_decimal_null() { - let a = - create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); - let b = - create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = 
- create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000), None]); - test_equal(&a, &b, false); - - let b = - create_decimal_array(vec![Some(15_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - #[test] - fn test_decimal_offsets() { - // Test the case where offset != 0 - let a = create_decimal_array(vec![ - Some(8_887_000_000), - None, - None, - Some(-8_887_000_000), - None, - None, - ]); - let b = create_decimal_array(vec![ - None, - Some(8_887_000_000), - None, - None, - Some(15_887_000_000), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(1, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(5, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 3); - let b_slice = b.slice(4, 3); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(1, 3); - let b_slice = b.slice(2, 3); - test_equal(&a_slice, &b_slice, false); - - let b = create_decimal_array(vec![ - None, - None, - None, - Some(-8_887_000_000), - Some(-3_000), - None, - ]); - let a_slice = a.slice(1, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeListBuilder::new(Int32Builder::with_capacity(10), 3); - - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - for _ in 0..builder.value_length() { - builder.values().append_null(); - } - builder.append(false); - } - } - builder.finish().into_data() - } - - #[test] - fn test_fixed_size_list_equal() { - let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), 
Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_fixed_list_null() { - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - Some(&[7, 8, 9]), - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = create_fixed_size_list_array(&[None, Some(&[4, 5, 6]), None, None]); - - test_equal(&a.slice(2, 4), &b, true); - test_equal(&a.slice(3, 3), &b.slice(1, 3), true); - } - - #[test] - fn test_fixed_list_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - #[test] - fn test_struct_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let a = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", 
ints.clone())]) - .unwrap(); - let a = a.data(); - - let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); - let b = b.data(); - - test_equal(a, b, true); - } - - #[test] - fn test_struct_equal_null() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - let ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 0])); - - let a = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build() - .unwrap(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); - let c = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(c_ints_non_null.data_ref().clone()) - .build() - .unwrap(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - - // test a nested struct - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - 
a.data_type().clone(), - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) - .len(5) - .add_child_data(a.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - // reconstruct b, but with different data where the first struct is null - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joanne"), // difference - None, - None, - Some("mark"), - Some("doe"), - ])); - let b = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build() - .unwrap(); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - b.data_type().clone(), - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) - .len(5) - .add_child_data(b) - .build() - .unwrap(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - } - - #[test] - fn test_struct_equal_null_variable_size() { - // the string arrays differ, but where the struct array is null - let strings1: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doel"), - ])); - let strings2: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joel"), - None, - None, - Some("mark"), - Some("doe"), - ])); - - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) - .len(5) - .add_child_data(strings1.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) - .len(5) - .add_child_data(strings2.data_ref().clone()) - .build() - .unwrap(); - let b 
= crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let strings3: ArrayRef = Arc::new(StringArray::from(vec![ - Some("mark"), - None, - None, - Some("doe"), - Some("joe"), - ])); - let c = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings3.data_ref().clone()) - .build() - .unwrap(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::::new_with_dictionary( - keys.len(), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - builder.append_null() - } - } - builder.finish().into_data() - } - - #[test] - fn test_dictionary_equal() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different len - let b = - create_dictionary_array(&["a", "c", "b"], &[Some("a"), Some("b"), Some("a")]); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), Some("b"), Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } - - #[test] - fn test_dictionary_equal_null() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = 
create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), None, Some("a"), Some("c")], - ); - - // equal to self - test_equal(&a, &a, true); - - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different null position - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), None], - ); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), None, Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } - - #[test] - fn test_non_null_empty_strings() { - let s = StringArray::from(vec![Some(""), Some(""), Some("")]); - - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); - - // string2 is identical to string1 except that it has no validity buffer but since there - // are no nulls, string1 and string2 are equal - test_equal(string1, &string2, true); - } - - #[test] - fn test_null_empty_strings() { - let s = StringArray::from(vec![Some(""), None, Some("")]); - - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); - - // string2 is identical to string1 except that it has no validity buffer since string1 has - // nulls in it, string1 and string2 are not equal - test_equal(string1, &string2, false); - } - - #[test] - fn test_union_equal_dense() { - let mut builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 
4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union1 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union2 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 5).unwrap(); - builder.append::("c", 4).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union3 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("c").unwrap(); - builder.append_null::("b").unwrap(); - builder.append::("b", 7).unwrap(); - let union4 = builder.build().unwrap(); - - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); - } - - #[test] - fn test_union_equal_sparse() { - let mut builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union1 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - 
builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union2 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 5).unwrap(); - builder.append::("c", 4).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union3 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("b", 7).unwrap(); - let union4 = builder.build().unwrap(); - - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); - } -} diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 8e9bc20b448..32a1da17f84 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -173,15 +173,12 @@ mod array_struct; mod array_union; mod builder; mod cast; -mod data; -mod equal; #[cfg(feature = "ffi")] mod ffi; mod iterator; mod null; mod ord; mod raw_pointer; -mod transform; use crate::datatypes::*; @@ -190,14 +187,9 @@ use crate::datatypes::*; pub use self::array::Array; pub use self::array::ArrayAccessor; pub use self::array::ArrayRef; -pub use self::data::ArrayData; -pub use self::data::ArrayDataBuilder; -pub use self::data::ArrayDataRef; - -#[cfg(any(feature = "ipc", feature = "ffi"))] -pub(crate) use self::data::layout; -#[cfg(feature = "ipc")] -pub(crate) use self::data::BufferSpec; +pub use arrow_data::{ + layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, DataTypeLayout, +}; pub use self::array_binary::BinaryArray; pub use self::array_binary::LargeBinaryArray; @@ -592,7 +584,7 @@ pub type DurationMillisecondBuilder = 
PrimitiveBuilder; pub type DurationMicrosecondBuilder = PrimitiveBuilder; pub type DurationNanosecondBuilder = PrimitiveBuilder; -pub use self::transform::{Capacities, MutableArrayData}; +pub use arrow_data::transform::{Capacities, MutableArrayData}; // --------------------- Array Iterator --------------------- diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs deleted file mode 100644 index 29d4434aafa..00000000000 --- a/arrow/src/array/transform/mod.rs +++ /dev/null @@ -1,1715 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::{ - data::{into_buffers, new_buffers}, - ArrayData, ArrayDataBuilder, OffsetSizeTrait, -}; -use crate::{ - buffer::MutableBuffer, - datatypes::DataType, - error::{ArrowError, Result}, - util::bit_util, -}; -use half::f16; -use std::mem; - -mod boolean; -mod fixed_binary; -mod fixed_size_list; -mod list; -mod null; -mod primitive; -mod structure; -mod union; -mod utils; -mod variable_size; - -type ExtendNullBits<'a> = Box; -// function that extends `[start..start+len]` to the mutable array. -// this is dynamic because different data_types influence how buffers and children are extended. 
-type Extend<'a> = Box; - -type ExtendNulls = Box; - -/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. -/// This is just a data container. -#[derive(Debug)] -struct _MutableArrayData<'a> { - pub data_type: DataType, - pub null_count: usize, - - pub len: usize, - pub null_buffer: MutableBuffer, - - // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). - // Thus, we place them in the stack to avoid bound checks and greater data locality. - pub buffer1: MutableBuffer, - pub buffer2: MutableBuffer, - pub child_data: Vec>, -} - -impl<'a> _MutableArrayData<'a> { - fn freeze(self, dictionary: Option) -> ArrayDataBuilder { - let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); - - let child_data = match self.data_type { - DataType::Dictionary(_, _) => vec![dictionary.unwrap()], - _ => { - let mut child_data = Vec::with_capacity(self.child_data.len()); - for child in self.child_data { - child_data.push(child.freeze()); - } - child_data - } - }; - - ArrayDataBuilder::new(self.data_type) - .offset(0) - .len(self.len) - .null_count(self.null_count) - .buffers(buffers) - .child_data(child_data) - .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) - } -} - -fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { - if let Some(bitmap) = array.null_bitmap() { - let bytes = bitmap.bits.as_slice(); - Box::new(move |mutable, start, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - mutable.null_count += crate::util::bit_mask::set_bits( - mutable.null_buffer.as_slice_mut(), - bytes, - mutable.len, - array.offset() + start, - len, - ); - }) - } else if use_nulls { - Box::new(|mutable, _, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - let write_data = mutable.null_buffer.as_slice_mut(); - let offset = mutable.len; - (0..len).for_each(|i| { - bit_util::set_bit(write_data, offset + i); - }); - }) - } else 
{ - Box::new(|_, _, _| {}) - } -} - -/// Struct to efficiently and interactively create an [ArrayData] from an existing [ArrayData] by -/// copying chunks. -/// The main use case of this struct is to perform unary operations to arrays of arbitrary types, such as `filter` and `take`. -/// # Example: -/// -/// ``` -/// use arrow::{array::{Int32Array, Array, MutableArrayData}}; -/// -/// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); -/// let array = array.data(); -/// // Create a new `MutableArrayData` from an array and with a capacity of 4. -/// // Capacity here is equivalent to `Vec::with_capacity` -/// let arrays = vec![array]; -/// let mut mutable = MutableArrayData::new(arrays, false, 4); -/// mutable.extend(0, 1, 3); // extend from the slice [1..3], [2,3] -/// mutable.extend(0, 0, 3); // extend from the slice [0..3], [1,2,3] -/// // `.freeze()` to convert `MutableArrayData` into a `ArrayData`. -/// let new_array = Int32Array::from(mutable.freeze()); -/// assert_eq!(Int32Array::from(vec![2, 3, 1, 2, 3]), new_array); -/// ``` -pub struct MutableArrayData<'a> { - #[allow(dead_code)] - arrays: Vec<&'a ArrayData>, - // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to - // mutability invariants (interior mutability): - // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not - // [MutableArrayData] itself - data: _MutableArrayData<'a>, - - // the child data of the `Array` in Dictionary arrays. - // This is not stored in `MutableArrayData` because these values constant and only needed - // at the end, when freezing [_MutableArrayData]. - dictionary: Option, - - // function used to extend values from arrays. This function's lifetime is bound to the array - // because it reads values from it. - extend_values: Vec>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. - extend_null_bits: Vec>, - - // function used to extend nulls. 
- // this is independent of the arrays and therefore has no lifetime. - extend_nulls: ExtendNulls, -} - -impl<'a> std::fmt::Debug for MutableArrayData<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // ignores the closures. - f.debug_struct("MutableArrayData") - .field("data", &self.data) - .finish() - } -} - -/// Builds an extend that adds `offset` to the source primitive -/// Additionally validates that `max` fits into the -/// the underlying primitive returning None if not -fn build_extend_dictionary( - array: &ArrayData, - offset: usize, - max: usize, -) -> Option { - use crate::datatypes::*; - macro_rules! validate_and_build { - ($dt: ty) => {{ - let _: $dt = max.try_into().ok()?; - let offset: $dt = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - }}; - } - match array.data_type() { - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => validate_and_build!(u8), - DataType::UInt16 => validate_and_build!(u16), - DataType::UInt32 => validate_and_build!(u32), - DataType::UInt64 => validate_and_build!(u64), - DataType::Int8 => validate_and_build!(i8), - DataType::Int16 => validate_and_build!(i16), - DataType::Int32 => validate_and_build!(i32), - DataType::Int64 => validate_and_build!(i64), - _ => unreachable!(), - }, - _ => None, - } -} - -fn build_extend(array: &ArrayData) -> Extend { - use crate::datatypes::*; - match array.data_type() { - DataType::Decimal128(_, _) => primitive::build_extend::(array), - DataType::Null => null::build_extend(array), - DataType::Boolean => boolean::build_extend(array), - DataType::UInt8 => primitive::build_extend::(array), - DataType::UInt16 => primitive::build_extend::(array), - DataType::UInt32 => primitive::build_extend::(array), - DataType::UInt64 => primitive::build_extend::(array), - DataType::Int8 => primitive::build_extend::(array), - DataType::Int16 => primitive::build_extend::(array), - DataType::Int32 => 
primitive::build_extend::(array), - DataType::Int64 => primitive::build_extend::(array), - DataType::Float32 => primitive::build_extend::(array), - DataType::Float64 => primitive::build_extend::(array), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive::build_extend::(array) - } - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - primitive::build_extend::(array) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive::build_extend::(array) - } - DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_size::build_extend::(array) - } - DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), - DataType::LargeList(_) => list::build_extend::(array), - DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), - DataType::Struct(_) => structure::build_extend(array), - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::build_extend(array) - } - DataType::Float16 => primitive::build_extend::(array), - DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), - DataType::Union(_, _, mode) => match mode { - UnionMode::Sparse => union::build_extend_sparse(array), - UnionMode::Dense => union::build_extend_dense(array), - }, - } -} - -fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { - use crate::datatypes::*; - Box::new(match data_type { - DataType::Decimal128(_, _) => primitive::extend_nulls::, - DataType::Null => null::extend_nulls, - DataType::Boolean => boolean::extend_nulls, - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => 
primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - DataType::Float32 => primitive::extend_nulls::, - DataType::Float64 => primitive::extend_nulls::, - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, - DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, - DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, - DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, - DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, - DataType::LargeList(_) => list::extend_nulls::, - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - _ => unreachable!(), - }, - DataType::Struct(_) => structure::extend_nulls, - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::extend_nulls - } - DataType::Float16 => primitive::extend_nulls::, - DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, - DataType::Union(_, _, mode) => match mode { - UnionMode::Sparse => union::extend_nulls_sparse, - UnionMode::Dense => union::extend_nulls_dense, - }, - }) -} - -fn preallocate_offset_and_binary_buffer( - capacity: usize, - binary_size: usize, -) -> [MutableBuffer; 2] { - // offsets - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - // safety: 
`unsafe` code assumes that this buffer is initialized with one element - buffer.push(Offset::zero()); - - [ - buffer, - MutableBuffer::new(binary_size * mem::size_of::()), - ] -} - -/// Define capacities of child data or data buffers. -#[derive(Debug, Clone)] -pub enum Capacities { - /// Binary, Utf8 and LargeUtf8 data types - /// Define - /// * the capacity of the array offsets - /// * the capacity of the binary/ str buffer - Binary(usize, Option), - /// List and LargeList data types - /// Define - /// * the capacity of the array offsets - /// * the capacity of the child data - List(usize, Option>), - /// Struct type - /// * the capacity of the array - /// * the capacities of the fields - Struct(usize, Option>), - /// Dictionary type - /// * the capacity of the array/keys - /// * the capacity of the values - Dictionary(usize, Option>), - /// Don't preallocate inner buffers and rely on array growth strategy - Array(usize), -} -impl<'a> MutableArrayData<'a> { - /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an - /// [ArrayData] from multiple `arrays`. - /// - /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls - /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. - /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. - pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { - Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) - } - - /// Similar to [MutableArrayData::new], but lets users define the preallocated capacities of the array. - /// See also [MutableArrayData::new] for more information on the arguments. - /// - /// # Panic - /// This function panics if the given `capacities` don't match the data type of `arrays`. Or when - /// a [Capacities] variant is not yet supported. 
- pub fn with_capacities( - arrays: Vec<&'a ArrayData>, - use_nulls: bool, - capacities: Capacities, - ) -> Self { - let data_type = arrays[0].data_type(); - use crate::datatypes::*; - - // if any of the arrays has nulls, insertions from any array requires setting bits - // as there is at least one array with nulls. - let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); - - let mut array_capacity; - - let [buffer1, buffer2] = match (data_type, &capacities) { - ( - DataType::LargeUtf8 | DataType::LargeBinary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { - array_capacity = *capacity; - preallocate_offset_and_binary_buffer::(*capacity, *value_cap) - } - ( - DataType::Utf8 | DataType::Binary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { - array_capacity = *capacity; - preallocate_offset_and_binary_buffer::(*capacity, *value_cap) - } - (_, Capacities::Array(capacity)) => { - array_capacity = *capacity; - new_buffers(data_type, *capacity) - } - ( - DataType::List(_) | DataType::LargeList(_), - Capacities::List(capacity, _), - ) => { - array_capacity = *capacity; - new_buffers(data_type, *capacity) - } - _ => panic!("Capacities: {:?} not yet supported", capacities), - }; - - let child_data = match &data_type { - DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) - | DataType::Null - | DataType::Boolean - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) - | DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::FixedSizeBinary(_) => vec![], - DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { - let childs = 
arrays - .iter() - .map(|array| &array.child_data()[0]) - .collect::>(); - - let capacities = if let Capacities::List(capacity, ref child_capacities) = - capacities - { - child_capacities - .clone() - .map(|c| *c) - .unwrap_or(Capacities::Array(capacity)) - } else { - Capacities::Array(array_capacity) - }; - - vec![MutableArrayData::with_capacities( - childs, use_nulls, capacities, - )] - } - // the dictionary type just appends keys and clones the values. - DataType::Dictionary(_, _) => vec![], - DataType::Struct(fields) => match capacities { - Capacities::Struct(capacity, Some(ref child_capacities)) => { - array_capacity = capacity; - (0..fields.len()) - .zip(child_capacities) - .map(|(i, child_cap)| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::with_capacities( - child_arrays, - use_nulls, - child_cap.clone(), - ) - }) - .collect::>() - } - Capacities::Struct(capacity, None) => { - array_capacity = capacity; - (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, capacity) - }) - .collect::>() - } - _ => (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, array_capacity) - }) - .collect::>(), - }, - DataType::FixedSizeList(_, _) => { - let childs = arrays - .iter() - .map(|array| &array.child_data()[0]) - .collect::>(); - vec![MutableArrayData::new(childs, use_nulls, array_capacity)] - } - DataType::Union(fields, _, _) => (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, array_capacity) - }) - .collect::>(), - }; - - // Get the dictionary if any, and if it is a concatenation of multiple - let (dictionary, dict_concat) = match 
&data_type { - DataType::Dictionary(_, _) => { - // If more than one dictionary, concatenate dictionaries together - let dict_concat = !arrays - .windows(2) - .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); - - match dict_concat { - false => (Some(arrays[0].child_data()[0].clone()), false), - true => { - if let Capacities::Dictionary(_, _) = capacities { - panic!("dictionary capacity not yet supported") - } - let dictionaries: Vec<_> = - arrays.iter().map(|array| &array.child_data()[0]).collect(); - let lengths: Vec<_> = dictionaries - .iter() - .map(|dictionary| dictionary.len()) - .collect(); - let capacity = lengths.iter().sum(); - - let mut mutable = - MutableArrayData::new(dictionaries, false, capacity); - - for (i, len) in lengths.iter().enumerate() { - mutable.extend(i, 0, *len) - } - - (Some(mutable.freeze()), true) - } - } - } - _ => (None, false), - }; - - let extend_nulls = build_extend_nulls(data_type); - - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(array, use_nulls)) - .collect(); - - let null_buffer = if use_nulls { - let null_bytes = bit_util::ceil(array_capacity, 8); - MutableBuffer::from_len_zeroed(null_bytes) - } else { - // create 0 capacity mutable buffer with the intention that it won't be used - MutableBuffer::with_capacity(0) - }; - - let extend_values = match &data_type { - DataType::Dictionary(_, _) => { - let mut next_offset = 0; - let extend_values: Result> = arrays - .iter() - .map(|array| { - let offset = next_offset; - let dict_len = array.child_data()[0].len(); - - if dict_concat { - next_offset += dict_len; - } - - build_extend_dictionary(array, offset, offset + dict_len) - .ok_or(ArrowError::DictionaryKeyOverflowError) - }) - .collect(); - - extend_values.expect("MutableArrayData::new is infallible") - } - _ => arrays.iter().map(|array| build_extend(array)).collect(), - }; - - let data = _MutableArrayData { - data_type: data_type.clone(), - len: 0, - null_count: 0, - null_buffer, - 
buffer1, - buffer2, - child_data, - }; - Self { - arrays, - data, - dictionary, - extend_values, - extend_null_bits, - extend_nulls, - } - } - - /// Extends this array with a chunk of its source arrays - /// - /// # Arguments - /// * `index` - the index of array that you what to copy values from - /// * `start` - the start index of the chunk (inclusive) - /// * `end` - the end index of the chunk (exclusive) - /// - /// # Panic - /// This function panics if there is an invalid index, - /// i.e. `index` >= the number of source arrays - /// or `end` > the length of the `index`th array - pub fn extend(&mut self, index: usize, start: usize, end: usize) { - let len = end - start; - (self.extend_null_bits[index])(&mut self.data, start, len); - (self.extend_values[index])(&mut self.data, index, start, len); - self.data.len += len; - } - - /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays - pub fn extend_nulls(&mut self, len: usize) { - // TODO: null_buffer should probably be extended here as well - // otherwise is_valid() could later panic - // add test to confirm - self.data.null_count += len; - (self.extend_nulls)(&mut self.data, len); - self.data.len += len; - } - - /// Returns the current length - #[inline] - pub fn len(&self) -> usize { - self.data.len - } - - /// Returns true if len is 0 - #[inline] - pub fn is_empty(&self) -> bool { - self.data.len == 0 - } - - /// Returns the current null count - #[inline] - pub fn null_count(&self) -> usize { - self.data.null_count - } - - /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. - pub fn freeze(self) -> ArrayData { - unsafe { self.data.freeze(self.dictionary).build_unchecked() } - } - - /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. - /// This is useful for extending the default behavior of MutableArrayData. 
- pub fn into_builder(self) -> ArrayDataBuilder { - self.data.freeze(self.dictionary) - } -} - -#[cfg(test)] -mod tests { - use std::{convert::TryFrom, sync::Arc}; - - use super::*; - use crate::array::Decimal128Array; - use crate::{ - array::{ - Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray, - FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array, - Int64Builder, ListBuilder, MapBuilder, NullArray, StringArray, - StringDictionaryBuilder, StructArray, UInt8Array, - }, - buffer::Buffer, - datatypes::Field, - }; - use crate::{ - array::{ListArray, StringBuilder}, - error::Result, - }; - - fn create_decimal_array( - array: Vec>, - precision: u8, - scale: u8, - ) -> Decimal128Array { - array - .into_iter() - .collect::() - .with_precision_and_scale(precision, scale) - .unwrap() - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let arrays = vec![Array::data(&decimal_array)]; - let mut a = MutableArrayData::new(arrays, true, 3); - a.extend(0, 0, 3); - a.extend(0, 2, 3); - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array(vec![Some(1), Some(2), None, None], 10, 3); - assert_eq!(array, expected); - } - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_offset() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array(vec![Some(2), None], 10, 3); - assert_eq!(array, expected); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_null_offset_nulls() { - let decimal_array = - 
create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null - a.extend_nulls(3); // 2, null, null, null, null - a.extend(0, 1, 3); //2, null, null, null, null, null, 3 - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array( - vec![Some(2), None, None, None, None, None, Some(3)], - 10, - 3, - ); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array w/ offset nor nulls - #[test] - fn test_primitive() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 3); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(1), Some(2)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset w/ nulls - #[test] - fn test_primitive_offset() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(2), Some(3)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset and nulls - #[test] - fn test_primitive_null_offset() { - let b = UInt8Array::from(vec![Some(1), None, Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_primitive_null_offset_nulls() { - 
let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); - a.extend_nulls(3); - a.extend(0, 1, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = - UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_list_null_offset() { - let int_builder = Int64Builder::with_capacity(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - builder.append(true); - let array = builder.finish(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let array = ListArray::from(result); - - let int_builder = Int64Builder::with_capacity(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - let expected = builder.finish(); - - assert_eq!(array, expected); - } - - /// tests extending from a variable-sized (strings and binary) array w/ offset with nulls - #[test] - fn test_variable_sized_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None]); - assert_eq!(result, expected); - } - - /// tests extending from a variable-sized (strings and binary) array - /// with an offset and nulls - #[test] - fn test_variable_sized_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, 
Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_multiple_with_nulls() { - let array1 = StringArray::from(vec!["hello", "world"]); - let array2 = StringArray::from(vec![Some("1"), None]); - - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 5); - - mutable.extend(0, 0, 2); - mutable.extend(1, 0, 2); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = - StringArray::from(vec![Some("hello"), Some("world"), Some("1"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_null_offset_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, true, 0); - - mutable.extend(0, 1, 3); - mutable.extend_nulls(1); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![None, Some("defh"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_bool() { - let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); 
- let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = BooleanArray::from(result); - - let expected = BooleanArray::from(vec![Some(true), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_null() { - let array1 = NullArray::new(10); - let array2 = NullArray::new(5); - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 1); - - let result = mutable.freeze(); - let result = NullArray::from(result); - - let expected = NullArray::new(3); - assert_eq!(result, expected); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::::new_with_dictionary( - keys.len(), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - builder.append_null() - } - } - builder.finish().into_data() - } - - #[test] - fn test_dictionary() { - // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) - let array = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), None, Some("c")], - ); - let arrays = vec![&array]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = DictionaryArray::from(result); - - let expected = Int16Array::from(vec![Some(1), None]); - assert_eq!(result.keys(), &expected); - } - - #[test] - fn test_struct() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - 
.unwrap(); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected = StructArray::try_from(vec![ - ("f1", strings.slice(1, 2)), - ("f2", ints.slice(1, 2)), - ]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_offset() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .slice(1, 3); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_strings: ArrayRef = - Arc::new(StringArray::from(vec![None, Some("mark")])); - let expected = StructArray::try_from(vec![ - ("f1", expected_strings), - ("f2", ints.slice(2, 2)), - ]) - .unwrap(); - - assert_eq!(array, expected); - } - - #[test] - fn test_struct_nulls() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = Arc::new(StringArray::from(vec![None, None])) as ArrayRef; - let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; - - let expected = - 
StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_many() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data(), array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 2); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = - Arc::new(StringArray::from(vec![None, None, Some("joe"), None])) as ArrayRef; - let expected_int = - Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; - - let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_binary_fixed_sized_offsets() { - let array = FixedSizeBinaryArray::try_from_iter( - vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), - ) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - let array = array.slice(1, 2); - // = [[0, 1], [0, 2]] due to the offset = 1 - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 2); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let result = FixedSizeBinaryArray::from(result); - - let expected = - FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(result, expected); - } - - #[test] - fn test_list_append() { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(24)); - builder.values().append_slice(&[1, 2, 
3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = builder.finish(); - - let a_builder = Int64Builder::with_capacity(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13]); - a_builder.append(true); - a_builder.append(true); - a_builder.values().append_slice(&[14, 15]); - a_builder.append(true); - let b = a_builder.finish(); - - let c = b.slice(1, 2); - - let mut mutable = - MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - - let finished = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - // append first array - Some(12), - Some(13), - Some(14), - Some(15), - // append second array - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 8, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.into_data()], - ) - .unwrap(); - assert_eq!(finished, expected_list_data); - } - - #[test] - fn test_list_nulls_append() -> Result<()> { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(32)); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.append(false); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = builder.finish(); - let a = a.data(); - - let 
mut builder = - ListBuilder::::new(Int64Builder::with_capacity(32)); - builder.values().append_slice(&[12, 13]); - builder.append(true); - builder.append(false); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[14, 15]); - builder.append(true); - let b = builder.finish(); - let b = b.data(); - let c = b.slice(1, 2); - let d = b.slice(2, 2); - - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); - let result = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 12, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![list_value_offsets], - vec![expected_int_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_append_with_capacities() { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(24)); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = builder.finish(); - - let a_builder = Int64Builder::with_capacity(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 
13]); - a_builder.append(true); - a_builder.append(true); - a_builder.values().append_slice(&[14, 15, 16, 17]); - a_builder.append(true); - let b = a_builder.finish(); - - let mutable = MutableArrayData::with_capacities( - vec![a.data(), b.data()], - false, - Capacities::List(6, Some(Box::new(Capacities::Array(17)))), - ); - - // capacities are rounded up to multiples of 64 by MutableBuffer - assert_eq!(mutable.data.buffer1.capacity(), 64); - assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); - } - - #[test] - fn test_map_nulls_append() -> Result<()> { - let mut builder = MapBuilder::::new( - None, - Int64Builder::with_capacity(32), - Int64Builder::with_capacity(32), - ); - builder.keys().append_slice(&[1, 2, 3]); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true).unwrap(); - builder.keys().append_slice(&[4, 5]); - builder.values().append_slice(&[4, 5]); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.keys().append_slice(&[6, 7, 8, 100, 101, 9, 10, 11]); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true).unwrap(); - - let a = builder.finish(); - let a = a.data(); - - let mut builder = MapBuilder::::new( - None, - Int64Builder::with_capacity(32), - Int64Builder::with_capacity(32), - ); - - builder.keys().append_slice(&[12, 13]); - builder.values().append_slice(&[12, 13]); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.append(true).unwrap(); - builder.keys().append_slice(&[100, 101, 14, 15]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[14, 15]); - builder.append(true).unwrap(); - - let b = builder.finish(); - let b = b.data(); - let c = b.slice(1, 2); - let d = b.slice(2, 2); - - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); - - mutable.extend(0, 0, a.len()); - 
mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); - let result = mutable.freeze(); - - let expected_key_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(100), - Some(101), - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - Some(100), - Some(101), - Some(14), - Some(15), - // slice(1, 2) results in no values added - Some(100), - Some(101), - Some(14), - Some(15), - ]); - - let expected_value_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - - let expected_entry_array = StructArray::from(vec![ - ( - Field::new("keys", DataType::Int64, false), - Arc::new(expected_key_array) as ArrayRef, - ), - ( - Field::new("values", DataType::Int64, true), - Arc::new(expected_value_array) as ArrayRef, - ), - ]); - - let map_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - - let expected_list_data = ArrayData::try_new( - DataType::Map( - Box::new(Field::new( - "entries", - DataType::Struct(vec![ - Field::new("keys", DataType::Int64, false), - Field::new("values", DataType::Int64, true), - ]), - false, - )), - false, - ), - 12, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![map_offsets], - vec![expected_entry_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_of_strings_append() -> Result<()> { - // [["alpha", "beta", None]] - let mut builder = ListBuilder::new(StringBuilder::new()); - builder.values().append_value("Hello"); - builder.values().append_value("Arrow"); - builder.values().append_null(); - builder.append(true); - let 
a = builder.finish(); - - // [["alpha", "beta"], [None], ["gamma", "delta", None]] - let mut builder = ListBuilder::new(StringBuilder::new()); - builder.values().append_value("alpha"); - builder.values().append_value("beta"); - builder.append(true); - builder.values().append_null(); - builder.append(true); - builder.values().append_value("gamma"); - builder.values().append_value("delta"); - builder.values().append_null(); - builder.append(true); - let b = builder.finish(); - - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 3); - mutable.extend(1, 0, 0); - let result = mutable.freeze(); - - let expected_string_array = StringArray::from(vec![ - // extend a[0..a.len()] - // a[0] - Some("Hello"), - Some("Arrow"), - None, - // extend b[0..b.len()] - // b[0] - Some("alpha"), - Some("beta"), - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[1..3] - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[0..0] - ]); - let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 6, - None, - 0, - vec![list_value_offsets], - vec![expected_string_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - Ok(()) - } - - #[test] - fn test_fixed_size_binary_append() { - let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; - let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - - let b = vec![ - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - ]; - let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - - let mut mutable = 
MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 4); - mutable.extend(1, 2, 3); - mutable.extend(1, 5, 5); - let result = mutable.freeze(); - - let expected = vec![ - // a - Some(vec![1, 2]), - Some(vec![3, 4]), - Some(vec![5, 6]), - // b - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - // b[1..4] - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - // b[2..3] - Some(vec![9, 10]), - // b[4..4] - ]; - let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(&result, expected.data()); - } - - /* - // this is an old test used on a meanwhile removed dead code - // that is still useful when `MutableArrayData` supports fixed-size lists. - #[test] - fn test_fixed_size_list_append() -> Result<()> { - let int_builder = UInt16Builder::new(64); - let mut builder = FixedSizeListBuilder::::new(int_builder, 2); - builder.values().append_slice(&[1, 2])?; - builder.append(true)?; - builder.values().append_slice(&[3, 4])?; - builder.append(false)?; - builder.values().append_slice(&[5, 6])?; - builder.append(true)?; - - let a_builder = UInt16Builder::new(64); - let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); - a_builder.values().append_slice(&[7, 8])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[9, 10])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[11, 12])?; - a_builder.append(false)?; - a_builder.values().append_slice(&[13, 14])?; - a_builder.append(true)?; - a_builder.values().append_null()?; - a_builder.values().append_null()?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 3).data(), - a.slice(2, 1).data(), - a.slice(5, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = 
UInt16Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - // append first array - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - None, - None, - // append slice(1, 3) - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - // append slice(2, 1) - Some(11), - Some(12), - ]); - let expected_list_data = ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), - 2, - ), - 12, - None, - None, - 0, - vec![], - vec![expected_int_array.data()], - ); - let expected_list = - FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayData); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - */ -} diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index b8719ad2d6c..34921ca97ee 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -366,7 +366,7 @@ pub fn not(left: &BooleanArray) -> Result { let null_bit_buffer = data .null_bitmap() .as_ref() - .map(|b| b.bits.bit_slice(left_offset, len)); + .map(|b| b.buffer().bit_slice(left_offset, len)); let values = buffer_unary_not(&data.buffers()[0], left_offset, len); @@ -507,7 +507,7 @@ where let and = buffer_bin_and( right.values(), right.offset(), - &right_bitmap.bits, + right_bitmap.buffer(), right.offset(), right.len(), ); @@ -520,7 +520,7 @@ where // Here we take care of the possible offsets of the left and right arrays all at once. 
let modified_null_buffer = match left_data.null_bitmap() { Some(left_null_bitmap) => buffer_bin_and( - &left_null_bitmap.bits, + left_null_bitmap.buffer(), left_data.offset(), &rcb, 0, diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index dcd80ab11d6..791363574c5 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -1612,7 +1612,7 @@ where .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), array.data().offset(), array.data().buffers().to_vec(), vec![], @@ -2408,7 +2408,7 @@ fn dictionary_cast( .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), cast_keys.data().offset(), cast_keys.data().buffers().to_vec(), vec![cast_values.into_data()], @@ -2622,7 +2622,7 @@ fn cast_primitive_to_list( .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), 0, vec![offsets.into()], vec![cast_array.into_data()], @@ -2649,7 +2649,9 @@ fn cast_list_inner( to_type.clone(), array.len(), Some(data.null_count()), - data.null_bitmap().cloned().map(|bitmap| bitmap.bits), + data.null_bitmap() + .cloned() + .map(|bitmap| bitmap.into_buffer()), array.offset(), // reuse offset buffer data.buffers().to_vec(), diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index d4eb5a3e1d2..fec464b9328 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -945,7 +945,7 @@ pub fn eq_bool_scalar(left: &BooleanArray, right: bool) -> Result left.data_ref() .null_bitmap() .as_ref() - .map(|b| b.bits.bit_slice(left_offset, len)), + .map(|b| b.buffer().bit_slice(left_offset, len)), 0, vec![values], vec![], diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 2f83871127f..e7d9bfd5a4f 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -30,10 +30,9 @@ mod numeric; 
pub use numeric::*; mod types; pub use types::*; -mod decimal; mod delta; -pub use decimal::*; +pub use arrow_data::decimal::*; pub use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit, UnionMode}; #[cfg(feature = "ffi")] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index a4d864754cd..5cc264b1392 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -250,8 +250,11 @@ pub use arrow_buffer::{alloc, buffer}; +pub mod bitmap { + pub use arrow_data::Bitmap; +} + pub mod array; -pub mod bitmap; pub mod compute; #[cfg(feature = "csv")] pub mod csv; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 90caa2e3a5c..a775b2ce8bc 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -26,7 +26,7 @@ use pyo3::import_exception; use pyo3::prelude::*; use pyo3::types::{PyList, PyTuple}; -use crate::array::{Array, ArrayData, ArrayRef}; +use crate::array::{make_array, Array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; @@ -188,7 +188,7 @@ impl PyArrowConvert for RecordBatch { let arrays = value.getattr("columns")?.downcast::()?; let arrays = arrays .iter() - .map(ArrayRef::from_pyarrow) + .map(|a| Ok(make_array(ArrayData::from_pyarrow(a)?))) .collect::>()?; let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; @@ -204,7 +204,7 @@ impl PyArrowConvert for RecordBatch { let columns = self.columns().iter(); for (array, field) in columns.zip(fields) { - py_arrays.push(array.to_pyarrow(py)?); + py_arrays.push(array.data().to_pyarrow(py)?); py_names.push(field.name()); } diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index adafc9f5053..310ffb8ee7a 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -17,10 +17,11 @@ pub use arrow_buffer::{bit_chunk_iterator, bit_util}; +pub use arrow_data::bit_iterator; +pub use arrow_data::bit_mask; + #[cfg(feature = "test_utils")] pub mod bench_util; -pub mod bit_iterator; -pub(crate) mod bit_mask; #[cfg(feature = 
"test_utils")] pub mod data_gen; pub mod display; diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs new file mode 100644 index 00000000000..5a1b48c009d --- /dev/null +++ b/arrow/tests/array_equal.rs @@ -0,0 +1,1274 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
/// Exercises `ArrayData` equality for `NullArray`: equal length, different length,
/// and sliced views taken at non-zero offsets.
#[test]
fn test_null_equal() {
    let twelve = NullArray::new(12);
    let also_twelve = NullArray::new(12);
    test_equal(twelve.data(), also_twelve.data(), true);

    let ten = NullArray::new(10);
    test_equal(twelve.data(), ten.data(), false);

    // Offset != 0: two slices of length 3 taken at different offsets.
    test_equal(&twelve.data().slice(2, 3), &ten.data().slice(1, 3), true);

    // Offset != 0: slices of length 4 and length 3.
    test_equal(&twelve.data().slice(5, 4), &ten.data().slice(3, 3), false);
}
/// Compares pairs of `Int32Array`s (with and without nulls) through `test_equal`.
#[test]
fn test_primitive() {
    // Builds both arrays from the given values and checks the expected equality.
    let check = |left: Vec<Option<i32>>, right: Vec<Option<i32>>, expected: bool| {
        let left = Int32Array::from(left);
        let right = Int32Array::from(right);
        test_equal(left.data(), right.data(), expected);
    };

    // identical values
    check(
        vec![Some(1), Some(2), Some(3)],
        vec![Some(1), Some(2), Some(3)],
        true,
    );
    // last value differs
    check(
        vec![Some(1), Some(2), Some(3)],
        vec![Some(1), Some(2), Some(4)],
        false,
    );
    // identical values, null in the same slot
    check(
        vec![Some(1), Some(2), None],
        vec![Some(1), Some(2), None],
        true,
    );
    // nulls in different slots
    check(
        vec![Some(1), None, Some(3)],
        vec![Some(1), Some(2), None],
        false,
    );
    // different number of nulls
    check(
        vec![Some(1), None, None],
        vec![Some(1), Some(2), None],
        false,
    );
}
/// A nullable string cell used by the string/binary equality fixtures.
type OptionString = Option<String>;

/// Returns the `(lhs, rhs, expected)` fixtures shared by the string and binary
/// equality tests: `expected` is `true` when `lhs` and `rhs` hold the same cells.
fn binary_cases() -> Vec<(Vec<OptionString>, Vec<OptionString>, bool)> {
    let some = |s: &str| -> OptionString { Some(s.to_owned()) };

    let base = vec![some("hello"), None, None, some("world"), None, None];
    // Same as `base`, except that cell 1 holds a value instead of a null.
    let not_base = vec![
        some("hello"),
        some("foo"),
        None,
        some("world"),
        None,
        None,
    ];

    vec![
        (
            vec![some("hello"), some("world")],
            vec![some("hello"), some("world")],
            true,
        ),
        (
            vec![some("hello"), some("world")],
            vec![some("hello"), some("arrow")],
            false,
        ),
        (base.clone(), base.clone(), true),
        (base, not_base, false),
    ]
}
rhs.data(); + test_equal(lhs, rhs, expected); + } +} + +#[test] +fn test_string_equal() { + test_generic_string_equal::() +} + +#[test] +fn test_large_string_equal() { + test_generic_string_equal::() +} + +fn test_generic_binary_equal() { + let cases = binary_cases(); + + for (lhs, rhs, expected) in cases { + let lhs = lhs + .iter() + .map(|x| x.as_deref().map(|x| x.as_bytes())) + .collect(); + let rhs = rhs + .iter() + .map(|x| x.as_deref().map(|x| x.as_bytes())) + .collect(); + let lhs = GenericBinaryArray::::from_opt_vec(lhs); + let lhs = lhs.data(); + let rhs = GenericBinaryArray::::from_opt_vec(rhs); + let rhs = rhs.data(); + test_equal(lhs, rhs, expected); + } +} + +#[test] +fn test_binary_equal() { + test_generic_binary_equal::() +} + +#[test] +fn test_large_binary_equal() { + test_generic_binary_equal::() +} + +#[test] +fn test_fixed_size_binary_array() { + let a_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; + let a = FixedSizeBinaryArray::try_from_iter(a_input_arg.into_iter()).unwrap(); + let a = a.data(); + + let b_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; + let b = FixedSizeBinaryArray::try_from_iter(b_input_arg.into_iter()).unwrap(); + let b = b.data(); + + test_equal(a, b, true); +} + +#[test] +fn test_string_offset() { + let a = StringArray::from(vec![Some("a"), None, Some("b")]); + let a = a.data(); + let a = a.slice(2, 1); + let b = StringArray::from(vec![Some("b")]); + let b = b.data(); + + test_equal(&a, b, true); +} + +#[test] +fn test_string_offset_larger() { + let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); + let a = a.data(); + let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); + let b = b.data(); + + test_equal(&a.slice(2, 2), &b.slice(0, 2), false); + test_equal(&a.slice(2, 2), &b.slice(1, 2), true); + test_equal(&a.slice(2, 2), &b.slice(2, 2), false); +} + +#[test] +fn test_null() { + let a = NullArray::new(2); + let a = a.data(); + let b = NullArray::new(2); + let b = 
b.data(); + test_equal(a, b, true); + + let b = NullArray::new(1); + let b = b.data(); + test_equal(a, b, false); +} + +fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { + let mut builder = ListBuilder::new(Int32Builder::with_capacity(10)); + for d in data.as_ref() { + if let Some(v) = d { + builder.values().append_slice(v.as_ref()); + builder.append(true); + } else { + builder.append(false); + } + } + builder.finish().into_data() +} + +#[test] +fn test_list_equal() { + let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + test_equal(&a, &b, true); + + let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + test_equal(&a, &b, false); +} + +#[test] +fn test_empty_offsets_list_equal() { + let empty: Vec = vec![]; + let values = Int32Array::from(empty); + let empty_offsets: [u8; 0] = []; + + let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + .null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + .null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + test_equal(&a, &b, true); + + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data( + Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) + .data() + .clone(), + ) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + + test_equal(&a, &c, true); +} + +// Test the case where null_count > 0 +#[test] +fn test_list_null() { + 
let a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + test_equal(&a, &b, true); + + let b = create_list_array(&[ + Some(&[1, 2]), + None, + Some(&[5, 6]), + Some(&[3, 4]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + test_equal(&a, &b, false); + + // a list where the nullness of values is determined by the list's bitmap + let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(c_values.into_data()) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + + let d_values = Int32Array::from(vec![ + Some(1), + Some(2), + None, + None, + Some(3), + Some(4), + None, + None, + ]); + let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(d_values.into_data()) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + test_equal(&c, &d, true); +} + +// Test the case where offset != 0 +#[test] +fn test_list_offsets() { + let a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); +} + +fn create_fixed_size_binary_array, T: 
AsRef<[Option]>>( + data: T, +) -> ArrayData { + let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); + + for d in data.as_ref() { + if let Some(v) = d { + builder.append_value(v.as_ref()).unwrap(); + } else { + builder.append_null(); + } + } + builder.finish().into_data() +} + +#[test] +fn test_fixed_size_binary_equal() { + let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); + test_equal(&a, &b, true); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); + test_equal(&a, &b, false); +} + +// Test the case where null_count > 0 +#[test] +fn test_fixed_size_binary_null() { + let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); + let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); + test_equal(&a, &b, true); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); + test_equal(&a, &b, false); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); + test_equal(&a, &b, false); +} + +#[test] +fn test_fixed_size_binary_offsets() { + // Test the case where offset != 0 + let a = create_fixed_size_binary_array(&[ + Some(b"hello"), + None, + None, + Some(b"world"), + None, + None, + ]); + let b = create_fixed_size_binary_array(&[ + Some(b"hello"), + None, + None, + Some(b"arrow"), + None, + None, + ]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(3, 1); + let b_slice = b.slice(3, 1); + test_equal(&a_slice, &b_slice, false); +} + +fn create_decimal_array(data: Vec>) -> ArrayData { + data.into_iter() + .collect::() + 
/// Decimal arrays without nulls: identical values are equal, a changed value is not.
#[test]
fn test_decimal_equal() {
    let reference = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]);

    let identical = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]);
    test_equal(&reference, &identical, true);

    // Only the first value differs from `reference`.
    let changed = create_decimal_array(vec![Some(15_887_000_000), Some(-8_887_000_000)]);
    test_equal(&reference, &changed, false);
}
+/// Create a fixed size list of 2 value lengths +fn create_fixed_size_list_array, T: AsRef<[Option]>>( + data: T, +) -> ArrayData { + let mut builder = FixedSizeListBuilder::new(Int32Builder::with_capacity(10), 3); + + for d in data.as_ref() { + if let Some(v) = d { + builder.values().append_slice(v.as_ref()); + builder.append(true); + } else { + for _ in 0..builder.value_length() { + builder.values().append_null(); + } + builder.append(false); + } + } + builder.finish().into_data() +} + +#[test] +fn test_fixed_size_list_equal() { + let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + test_equal(&a, &b, true); + + let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + test_equal(&a, &b, false); +} + +// Test the case where null_count > 0 +#[test] +fn test_fixed_list_null() { + let a = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + test_equal(&a, &b, true); + + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + Some(&[7, 8, 9]), + Some(&[4, 5, 6]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[3, 6, 9]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_fixed_size_list_array(&[None, Some(&[4, 5, 6]), None, None]); + + test_equal(&a.slice(2, 4), &b, true); + test_equal(&a.slice(3, 3), &b.slice(1, 3), true); +} + +#[test] +fn test_fixed_list_offsets() { + // Test the case where offset != 0 + let a = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[3, 6, 9]), + None, + None, + ]); + + 
/// Two struct arrays built from the same child arrays must compare equal.
#[test]
fn test_struct_equal() {
    let names = StringArray::from(vec![
        Some("joe"),
        None,
        None,
        Some("mark"),
        Some("doe"),
    ]);
    let numbers = Int32Array::from(vec![Some(1), Some(2), None, Some(4), Some(5)]);
    let names: ArrayRef = Arc::new(names);
    let numbers: ArrayRef = Arc::new(numbers);

    // The first struct shares the children; the second takes ownership of them.
    let first = StructArray::try_from(vec![
        ("f1", Arc::clone(&names)),
        ("f2", Arc::clone(&numbers)),
    ])
    .unwrap();
    let second = StructArray::try_from(vec![("f1", names), ("f2", numbers)]).unwrap();

    test_equal(first.data(), second.data(), true);
}
not equal + let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); + let c = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings.data_ref().clone()) + .add_child_data(c_ints_non_null.data_ref().clone()) + .build() + .unwrap(); + let c = make_array(c); + + test_equal(a.data_ref(), c.data_ref(), false); + + // test a nested struct + let a = ArrayData::builder(DataType::Struct(vec![Field::new( + "f3", + a.data_type().clone(), + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) + .len(5) + .add_child_data(a.data_ref().clone()) + .build() + .unwrap(); + let a = make_array(a); + + // reconstruct b, but with different data where the first struct is null + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joanne"), // difference + None, + None, + Some("mark"), + Some("doe"), + ])); + let b = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings.data_ref().clone()) + .add_child_data(ints_non_null.data_ref().clone()) + .build() + .unwrap(); + + let b = ArrayData::builder(DataType::Struct(vec![Field::new( + "f3", + b.data_type().clone(), + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) + .len(5) + .add_child_data(b) + .build() + .unwrap(); + let b = make_array(b); + + test_equal(a.data_ref(), b.data_ref(), true); +} + +#[test] +fn test_struct_equal_null_variable_size() { + // the string arrays differ, but where the struct array is null + let strings1: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doel"), + ])); + let strings2: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joel"), + None, + None, 
+ Some("mark"), + Some("doe"), + ])); + + let a = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) + .len(5) + .add_child_data(strings1.data_ref().clone()) + .build() + .unwrap(); + let a = make_array(a); + + let b = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) + .len(5) + .add_child_data(strings2.data_ref().clone()) + .build() + .unwrap(); + let b = make_array(b); + + test_equal(a.data_ref(), b.data_ref(), true); + + // test with arrays that are not equal + let strings3: ArrayRef = Arc::new(StringArray::from(vec![ + Some("mark"), + None, + None, + Some("doe"), + Some("joe"), + ])); + let c = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings3.data_ref().clone()) + .build() + .unwrap(); + let c = make_array(c); + + test_equal(a.data_ref(), c.data_ref(), false); +} + +fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { + let values = StringArray::from(values.to_vec()); + let mut builder = + StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) + .unwrap(); + for key in keys { + if let Some(v) = key { + builder.append(v).unwrap(); + } else { + builder.append_null() + } + } + builder.finish().into_data() +} + +#[test] +fn test_dictionary_equal() { + // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) + let a = create_dictionary_array( + &["a", "b", "c"], + &[Some("a"), Some("b"), Some("a"), Some("c")], + ); + // different representation (values and keys are swapped), same result + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), Some("b"), Some("a"), Some("c")], + ); + test_equal(&a, &b, true); + + // different len + let b = create_dictionary_array(&["a", "c", "b"], 
/// Dictionary equality when one row is null: compares a reference array against
/// variants that differ in dictionary order, null position, or row values.
#[test]
fn test_dictionary_equal_null() {
    // Rows passed to the builder: "a", null, "a", "c".
    let a = create_dictionary_array(
        &["a", "b", "c"],
        &[Some("a"), None, Some("a"), Some("c")],
    );

    // equal to self
    test_equal(&a, &a, true);

    // Builds a second array and compares it against `a`.
    let expect = |values: &[&str], rows: &[Option<&str>], expected: bool| {
        let b = create_dictionary_array(values, rows);
        test_equal(&a, &b, expected);
    };

    // dictionary values listed in a different order, same rows
    expect(
        &["a", "c", "b"],
        &[Some("a"), None, Some("a"), Some("c")],
        true,
    );

    // different null position
    expect(
        &["a", "c", "b"],
        &[Some("a"), Some("b"), Some("a"), None],
        false,
    );

    // last row holds a different value
    expect(
        &["a", "c", "b"],
        &[Some("a"), None, Some("a"), Some("a")],
        false,
    );

    // different dictionary values ("d" instead of "c")
    expect(
        &["a", "b", "d"],
        &[Some("a"), None, Some("a"), Some("d")],
        false,
    );
}
/// Dense unions: an identically built union is equal, while unions that differ
/// in a value or in which field a slot belongs to are not.
#[test]
fn test_union_equal_dense() {
    let mut builder = UnionBuilder::new_dense();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append::<Int32Type>("b", 2).unwrap();
    builder.append::<Int32Type>("c", 3).unwrap();
    builder.append::<Int32Type>("a", 4).unwrap();
    builder.append_null::<Int32Type>("a").unwrap();
    builder.append::<Int32Type>("a", 6).unwrap();
    builder.append::<Int32Type>("b", 7).unwrap();
    let union1 = builder.build().unwrap();

    // Same sequence of appends as `union1`.
    builder = UnionBuilder::new_dense();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append::<Int32Type>("b", 2).unwrap();
    builder.append::<Int32Type>("c", 3).unwrap();
    builder.append::<Int32Type>("a", 4).unwrap();
    builder.append_null::<Int32Type>("a").unwrap();
    builder.append::<Int32Type>("a", 6).unwrap();
    builder.append::<Int32Type>("b", 7).unwrap();
    let union2 = builder.build().unwrap();

    // Slot 3 holds 5 instead of 4, and slot 4 is ("c", 4) instead of a null "a".
    builder = UnionBuilder::new_dense();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append::<Int32Type>("b", 2).unwrap();
    builder.append::<Int32Type>("c", 3).unwrap();
    builder.append::<Int32Type>("a", 5).unwrap();
    builder.append::<Int32Type>("c", 4).unwrap();
    builder.append::<Int32Type>("a", 6).unwrap();
    builder.append::<Int32Type>("b", 7).unwrap();
    let union3 = builder.build().unwrap();

    // Slots 4 and 5 are nulls of fields "c" and "b" instead of null "a" and ("a", 6).
    builder = UnionBuilder::new_dense();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append::<Int32Type>("b", 2).unwrap();
    builder.append::<Int32Type>("c", 3).unwrap();
    builder.append::<Int32Type>("a", 4).unwrap();
    builder.append_null::<Int32Type>("c").unwrap();
    builder.append_null::<Int32Type>("b").unwrap();
    builder.append::<Int32Type>("b", 7).unwrap();
    let union4 = builder.build().unwrap();

    test_equal(union1.data(), union2.data(), true);
    test_equal(union1.data(), union3.data(), false);
    test_equal(union1.data(), union4.data(), false);
}
builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union1 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union2 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 5).unwrap(); + builder.append::("c", 4).unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union3 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("b", 7).unwrap(); + let union4 = builder.build().unwrap(); + + test_equal(union1.data(), union2.data(), true); + test_equal(union1.data(), union3.data(), false); + test_equal(union1.data(), union4.data(), false); +} + +#[test] +fn test_boolean_slice() { + let array = BooleanArray::from(vec![true; 32]); + let slice = array.slice(4, 12); + assert_eq!(slice.data(), slice.data()); + + let slice = array.slice(8, 12); + assert_eq!(slice.data(), slice.data()); + + let slice = array.slice(8, 24); + assert_eq!(slice.data(), slice.data()); +} + +#[test] +fn test_sliced_nullable_boolean_array() { + let a = BooleanArray::from(vec![None; 32]); + let b = BooleanArray::from(vec![true; 32]); + let slice_a = a.slice(1, 12); + let slice_b = b.slice(1, 12); + 
assert_ne!(slice_a.data(), slice_b.data()); +} + +#[test] +fn list_array_non_zero_nulls() { + // Tests handling of list arrays with non-empty null ranges + let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); + builder.values().append_value(1); + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); + builder.append(false); + let array1 = builder.finish(); + + let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); + builder.values().append_value(1); + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + let array2 = builder.finish(); + + assert_eq!(array1, array2); +} + +#[test] +fn test_list_different_offsets() { + let a = ListArray::from_iter_primitive::([ + Some([Some(0), Some(0)]), + Some([Some(1), Some(2)]), + Some([None, None]), + ]); + let b = ListArray::from_iter_primitive::([ + Some([Some(1), Some(2)]), + Some([None, None]), + Some([None, None]), + ]); + let a_slice = a.slice(1, 2); + let b_slice = b.slice(0, 2); + assert_eq!(&a_slice, &b_slice); +} diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs new file mode 100644 index 00000000000..3619abacdc9 --- /dev/null +++ b/arrow/tests/array_transform.rs @@ -0,0 +1,1005 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, + FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, + ListBuilder, MapBuilder, NullArray, StringArray, StringBuilder, + StringDictionaryBuilder, StructArray, UInt8Array, +}; +use arrow::datatypes::Int16Type; +use arrow_buffer::Buffer; +use arrow_data::transform::MutableArrayData; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field}; +use std::sync::Arc; + +fn create_decimal_array( + array: Vec>, + precision: u8, + scale: u8, +) -> Decimal128Array { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + .unwrap() +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let arrays = vec![Array::data(&decimal_array)]; + let mut a = MutableArrayData::new(arrays, true, 3); + a.extend(0, 0, 3); + a.extend(0, 2, 3); + let result = a.freeze(); + let array = Decimal128Array::from(result); + let expected = create_decimal_array(vec![Some(1), Some(2), None, None], 10, 3); + assert_eq!(array, expected); +} +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal_offset() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 + let arrays = vec![decimal_array.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); // 2, null + let result = a.freeze(); 
+ let array = Decimal128Array::from(result); + let expected = create_decimal_array(vec![Some(2), None], 10, 3); + assert_eq!(array, expected); +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal_null_offset_nulls() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 + let arrays = vec![decimal_array.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); // 2, null + a.extend_nulls(3); // 2, null, null, null, null + a.extend(0, 1, 3); //2, null, null, null, null, null, 3 + let result = a.freeze(); + let array = Decimal128Array::from(result); + let expected = + create_decimal_array(vec![Some(2), None, None, None, None, None, Some(3)], 10, 3); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array without an offset or nulls +#[test] +fn test_primitive() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 3); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(1), Some(2)]); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array with an offset but without nulls +#[test] +fn test_primitive_offset() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 2); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(2), Some(3)]); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array with offset and nulls +#[test] +fn test_primitive_null_offset() { + let b = UInt8Array::from(vec![Some(1), None, Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 
2); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![None, Some(3)]); + assert_eq!(array, expected); +} + +#[test] +fn test_primitive_null_offset_nulls() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); + a.extend_nulls(3); + a.extend(0, 1, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); + assert_eq!(array, expected); +} + +#[test] +fn test_list_null_offset() { + let int_builder = Int64Builder::with_capacity(24); + let mut builder = ListBuilder::::new(int_builder); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.values().append_slice(&[6, 7, 8]); + builder.append(true); + let array = builder.finish(); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + mutable.extend(0, 0, 1); + + let result = mutable.freeze(); + let array = ListArray::from(result); + + let int_builder = Int64Builder::with_capacity(24); + let mut builder = ListBuilder::::new(int_builder); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + let expected = builder.finish(); + + assert_eq!(array, expected); +} + +/// tests extending from a variable-sized (strings and binary) array without an offset but with nulls +#[test] +fn test_variable_sized_nulls() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None]); + assert_eq!(result, 
expected); +} + +/// tests extending from a variable-sized (strings and binary) array +/// with an offset and nulls +#[test] +fn test_variable_sized_offsets() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 0, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); + assert_eq!(result, expected); +} + +#[test] +fn test_string_offsets() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 0, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); + assert_eq!(result, expected); +} + +#[test] +fn test_multiple_with_nulls() { + let array1 = StringArray::from(vec!["hello", "world"]); + let array2 = StringArray::from(vec![Some("1"), None]); + + let arrays = vec![array1.data(), array2.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 5); + + mutable.extend(0, 0, 2); + mutable.extend(1, 0, 2); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("hello"), Some("world"), Some("1"), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_string_null_offset_nulls() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, true, 0); + + mutable.extend(0, 1, 3); + mutable.extend_nulls(1); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + 
let expected = StringArray::from(vec![None, Some("defh"), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_bool() { + let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = BooleanArray::from(result); + + let expected = BooleanArray::from(vec![Some(true), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_null() { + let array1 = NullArray::new(10); + let array2 = NullArray::new(5); + let arrays = vec![array1.data(), array2.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + mutable.extend(1, 0, 1); + + let result = mutable.freeze(); + let result = NullArray::from(result); + + let expected = NullArray::new(3); + assert_eq!(result, expected); +} + +fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { + let values = StringArray::from(values.to_vec()); + let mut builder = + StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) + .unwrap(); + for key in keys { + if let Some(v) = key { + builder.append(v).unwrap(); + } else { + builder.append_null() + } + } + builder.finish().into_data() +} + +#[test] +fn test_dictionary() { + // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) + let array = create_dictionary_array( + &["a", "b", "c"], + &[Some("a"), Some("b"), None, Some("c")], + ); + let arrays = vec![&array]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = DictionaryArray::from(result); + + let expected = Int16Array::from(vec![Some(1), None]); + assert_eq!(result.keys(), &expected); +} + +#[test] +fn test_struct() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = 
Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected = StructArray::try_from(vec![ + ("f1", strings.slice(1, 2)), + ("f2", ints.slice(1, 2)), + ]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_struct_offset() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .slice(1, 3); + let arrays = vec![array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected_strings: ArrayRef = + Arc::new(StringArray::from(vec![None, Some("mark")])); + let expected = + StructArray::try_from(vec![("f1", expected_strings), ("f2", ints.slice(2, 2))]) + .unwrap(); + + assert_eq!(array, expected); +} + +#[test] +fn test_struct_nulls() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected_string 
= Arc::new(StringArray::from(vec![None, None])) as ArrayRef; + let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; + + let expected = + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_struct_many() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data(), array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + mutable.extend(1, 0, 2); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected_string = + Arc::new(StringArray::from(vec![None, None, Some("joe"), None])) as ArrayRef; + let expected_int = + Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; + + let expected = + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_binary_fixed_sized_offsets() { + let array = FixedSizeBinaryArray::try_from_iter( + vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), + ) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + let array = array.slice(1, 2); + // = [[0, 1], [0, 2]] due to the offset = 1 + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 2); + mutable.extend(0, 0, 1); + + let result = mutable.freeze(); + let result = FixedSizeBinaryArray::from(result); + + let expected = + FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + assert_eq!(result, 
expected); +} + +#[test] +fn test_list_append() { + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(24)); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true); + let a = builder.finish(); + + let a_builder = Int64Builder::with_capacity(24); + let mut a_builder = ListBuilder::::new(a_builder); + a_builder.values().append_slice(&[12, 13]); + a_builder.append(true); + a_builder.append(true); + a_builder.values().append_slice(&[14, 15]); + a_builder.append(true); + let b = a_builder.finish(); + + let c = b.slice(1, 2); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + + let finished = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + // append first array + Some(12), + Some(13), + Some(14), + Some(15), + // append second array + Some(14), + Some(15), + ]); + let list_value_offsets = + Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 8, + None, + 0, + vec![list_value_offsets], + vec![expected_int_array.into_data()], + ) + .unwrap(); + assert_eq!(finished, expected_list_data); +} + +#[test] +fn test_list_nulls_append() { + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(32)); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.append(false); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_null(); + 
builder.values().append_null(); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true); + let a = builder.finish(); + let a = a.data(); + + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(32)); + builder.values().append_slice(&[12, 13]); + builder.append(true); + builder.append(false); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[14, 15]); + builder.append(true); + let b = builder.finish(); + let b = b.data(); + let c = b.slice(1, 2); + let d = b.slice(2, 2); + + let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + mutable.extend(3, 0, d.len()); + let result = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + None, + None, + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + None, + None, + Some(14), + Some(15), + // slice(1, 2) results in no values added + None, + None, + Some(14), + Some(15), + ]); + let list_value_offsets = + Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 12, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![list_value_offsets], + vec![expected_int_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_map_nulls_append() { + let mut builder = MapBuilder::::new( + None, + Int64Builder::with_capacity(32), + Int64Builder::with_capacity(32), + ); + builder.keys().append_slice(&[1, 2, 3]); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true).unwrap(); + builder.keys().append_slice(&[4, 5]); + builder.values().append_slice(&[4, 5]); + builder.append(true).unwrap(); + 
builder.append(false).unwrap(); + builder.keys().append_slice(&[6, 7, 8, 100, 101, 9, 10, 11]); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true).unwrap(); + + let a = builder.finish(); + let a = a.data(); + + let mut builder = MapBuilder::::new( + None, + Int64Builder::with_capacity(32), + Int64Builder::with_capacity(32), + ); + + builder.keys().append_slice(&[12, 13]); + builder.values().append_slice(&[12, 13]); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.append(true).unwrap(); + builder.keys().append_slice(&[100, 101, 14, 15]); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[14, 15]); + builder.append(true).unwrap(); + + let b = builder.finish(); + let b = b.data(); + let c = b.slice(1, 2); + let d = b.slice(2, 2); + + let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + mutable.extend(3, 0, d.len()); + let result = mutable.freeze(); + + let expected_key_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(100), + Some(101), + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + Some(100), + Some(101), + Some(14), + Some(15), + // slice(1, 2) results in no values added + Some(100), + Some(101), + Some(14), + Some(15), + ]); + + let expected_value_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + None, + None, + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + None, + None, + Some(14), + Some(15), + // slice(1, 2) results in no values added + None, + None, + Some(14), + Some(15), + ]); + + let expected_entry_array = 
StructArray::from(vec![ + ( + Field::new("keys", DataType::Int64, false), + Arc::new(expected_key_array) as ArrayRef, + ), + ( + Field::new("values", DataType::Int64, true), + Arc::new(expected_value_array) as ArrayRef, + ), + ]); + + let map_offsets = + Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + + let expected_list_data = ArrayData::try_new( + DataType::Map( + Box::new(Field::new( + "entries", + DataType::Struct(vec![ + Field::new("keys", DataType::Int64, false), + Field::new("values", DataType::Int64, true), + ]), + false, + )), + false, + ), + 12, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![map_offsets], + vec![expected_entry_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_list_of_strings_append() { + // [["Hello", "Arrow", None]] + let mut builder = ListBuilder::new(StringBuilder::new()); + builder.values().append_value("Hello"); + builder.values().append_value("Arrow"); + builder.values().append_null(); + builder.append(true); + let a = builder.finish(); + + // [["alpha", "beta"], [None], ["gamma", "delta", None]] + let mut builder = ListBuilder::new(StringBuilder::new()); + builder.values().append_value("alpha"); + builder.values().append_value("beta"); + builder.append(true); + builder.values().append_null(); + builder.append(true); + builder.values().append_value("gamma"); + builder.values().append_value("delta"); + builder.values().append_null(); + builder.append(true); + let b = builder.finish(); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 3); + mutable.extend(1, 0, 0); + let result = mutable.freeze(); + + let expected_string_array = StringArray::from(vec![ + // extend a[0..a.len()] + // a[0] + Some("Hello"), + Some("Arrow"), + None, + // extend b[0..b.len()] + // b[0] + Some("alpha"), + Some("beta"), + // b[1] + None, + // b[2] + 
Some("gamma"), + Some("delta"), + None, + // extend b[1..3] + // b[1] + None, + // b[2] + Some("gamma"), + Some("delta"), + None, + // extend b[0..0] + ]); + let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + 6, + None, + 0, + vec![list_value_offsets], + vec![expected_string_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_fixed_size_binary_append() { + let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; + let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + + let b = vec![ + None, + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + ]; + let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 4); + mutable.extend(1, 2, 3); + mutable.extend(1, 5, 5); + let result = mutable.freeze(); + + let expected = vec![ + // a + Some(vec![1, 2]), + Some(vec![3, 4]), + Some(vec![5, 6]), + // b + None, + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + // b[1..4] + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + // b[2..3] + Some(vec![9, 10]), + // b[5..5] + ]; + let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + assert_eq!(&result, expected.data()); +} + +/* +// this is an old test for code that has since been removed as dead code, +// but it will be useful again once `MutableArrayData` supports fixed-size lists. 
+#[test] +fn test_fixed_size_list_append() -> Result<()> { + let int_builder = UInt16Builder::new(64); + let mut builder = FixedSizeListBuilder::::new(int_builder, 2); + builder.values().append_slice(&[1, 2])?; + builder.append(true)?; + builder.values().append_slice(&[3, 4])?; + builder.append(false)?; + builder.values().append_slice(&[5, 6])?; + builder.append(true)?; + + let a_builder = UInt16Builder::new(64); + let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); + a_builder.values().append_slice(&[7, 8])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[9, 10])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[11, 12])?; + a_builder.append(false)?; + a_builder.values().append_slice(&[13, 14])?; + a_builder.append(true)?; + a_builder.values().append_null()?; + a_builder.values().append_null()?; + a_builder.append(true)?; + let a = a_builder.finish(); + + // append array + builder.append_data(&[ + a.data(), + a.slice(1, 3).data(), + a.slice(2, 1).data(), + a.slice(5, 0).data(), + ])?; + let finished = builder.finish(); + + let expected_int_array = UInt16Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + // append first array + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + None, + None, + // append slice(1, 3) + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + // append slice(2, 1) + Some(11), + Some(12), + ]); + let expected_list_data = ArrayData::new( + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt16, true)), + 2, + ), + 12, + None, + None, + 0, + vec![], + vec![expected_int_array.data()], + ); + let expected_list = + FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayData); + assert_eq!(&expected_list.values(), &finished.values()); + assert_eq!(expected_list.len(), finished.len()); + + Ok(()) +} +*/ diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs new 
file mode 100644 index 00000000000..93b936e7c2f --- /dev/null +++ b/arrow/tests/array_validation.rs @@ -0,0 +1,1100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, + Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, + UInt8Builder, +}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field, UnionMode}; +use std::ptr::NonNull; +use std::sync::Arc; + +#[test] +#[should_panic( + expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" +)] +fn test_buffer_too_small() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + // should fail as the declared size (10*8 = 80) is larger than the underlying buffer (8) + ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap(); +} + +#[test] +#[should_panic( + expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" +)] +fn test_buffer_too_small_offset() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + // should fail -- size is ok, but also has offset + ArrayData::try_new(DataType::Int64, 1, None, 1, 
vec![buffer], vec![]).unwrap(); +} + +#[test] +#[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")] +fn test_bad_number_of_buffers() { + let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]) + .unwrap(); +} + +#[test] +#[should_panic(expected = "integer overflow computing min buffer size")] +fn test_fixed_width_overflow() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) + .unwrap(); +} + +#[test] +#[should_panic(expected = "null_bit_buffer size too small. got 1 needed 2")] +fn test_bitmap_too_small() { + let buffer = make_i32_buffer(9); + let null_bit_buffer = Buffer::from(vec![0b11111111]); + + ArrayData::try_new( + DataType::Int32, + 9, + Some(null_bit_buffer), + 0, + vec![buffer], + vec![], + ) + .unwrap(); +} + +// Test creating a dictionary with a non integer type +#[test] +#[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] +fn test_non_int_dictionary() { + let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let data_type = + DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); + let child_data = ArrayData::try_new( + DataType::Int32, + 1, + None, + 0, + vec![i32_buffer.clone()], + vec![], + ) + .unwrap(); + ArrayData::try_new( + data_type, + 1, + None, + 0, + vec![i32_buffer.clone(), i32_buffer], + vec![child_data], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")] +fn test_mismatched_dictionary_types() { + // test w/ dictionary created with a child array data that has type different than declared + let string_array: StringArray = vec![Some("foo"), Some("bar")].into_iter().collect(); + let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]); + // Dict says LargeUtf8 but array is Utf8 + let data_type = + 
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::LargeUtf8)); + let child_data = string_array.into_data(); + ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data]) + .unwrap(); +} + +#[test] +fn test_empty_utf8_array_with_empty_offsets_buffer() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from(&[]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +fn test_empty_utf8_array_with_single_zero_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] +fn test_empty_utf8_array_with_invalid_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[1i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +fn test_empty_utf8_array_with_non_zero_offset() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 3, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" +)] +fn test_empty_large_utf8_array_with_wrong_type_offsets() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::LargeUtf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Buffer 0 of Utf8 isn't large enough. 
Expected 12 bytes got 8")] +fn test_validate_offsets_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 24 bytes got 16" +)] +fn test_validate_offsets_i64() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); + ArrayData::try_new( + DataType::LargeUtf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] +fn test_validate_offsets_negative_first_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] +fn test_validate_offsets_negative_last_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] +fn test_validate_offsets_range_too_small() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // start offset is larger than end + let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = 
"Last offset 10 of Utf8 is larger than values length 6")] +fn test_validate_offsets_range_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] +fn test_validate_offsets_first_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +fn test_validate_offsets_first_too_large_skipped() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer, but offset starts at 1 so it is skipped + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); + let data = ArrayData::try_new( + DataType::Utf8, + 2, + None, + 1, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + let array: StringArray = data.into(); + let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect(); + assert_eq!(array, expected); +} + +#[test] +#[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] +fn test_validate_offsets_last_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList" +)] +fn 
test_validate_fixed_size_list() { + // child has 4 elements, + let child_array = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // but claim we have 3 elements for a fixed size of 2 + // 10 is off the end of the buffer + let field = Field::new("field", DataType::Int32, true); + ArrayData::try_new( + DataType::FixedSizeList(Box::new(field), 2), + 3, + None, + 0, + vec![], + vec![child_array.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Child type mismatch for Struct")] +fn test_validate_struct_child_type() { + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // validate the the type of struct fields matches child fields + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), + 3, + None, + 0, + vec![], + vec![field1.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" +)] +fn test_validate_struct_child_length() { + // field length only has 4 items, but array claims to have 6 + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), + 6, + None, + 0, + vec![], + vec![field1.into_data()], + ) + .unwrap(); +} + +/// Test that the array of type `data_type` that has invalid utf8 data errors +fn check_utf8_validation(data_type: DataType) { + // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself + let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); + let offsets: Vec = [0, 2, 3] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Invalid UTF8 sequence 
at string index 1 (2..3)")] +fn test_validate_utf8_content() { + check_utf8_validation::(DataType::Utf8); +} + +#[test] +#[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] +fn test_validate_large_utf8_content() { + check_utf8_validation::(DataType::LargeUtf8); +} + +/// Tests that offsets are at valid codepoint boundaries +fn check_utf8_char_boundary(data_type: DataType) { + let data_buffer = Buffer::from("🙀".as_bytes()); + let offsets: Vec = [0, 1, data_buffer.len()] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] +fn test_validate_utf8_char_boundary() { + check_utf8_char_boundary::(DataType::Utf8); +} + +#[test] +#[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] +fn test_validate_large_utf8_char_boundary() { + check_utf8_char_boundary::(DataType::LargeUtf8); +} + +/// Test that the array of type `data_type` that has invalid indexes (out of bounds) +fn check_index_out_of_bounds_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First two offsets are fine, then 5 is out of bounds + let offsets: Vec = [0, 1, 2, 5, 2] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Utf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn 
test_validate_large_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeUtf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Binary); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_large_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeBinary); +} + +// validate that indexes don't go bacwards check indexes that go backwards +fn check_index_backwards_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First three offsets are fine, then 1 goes backwards + let offsets: Vec = [0, 1, 2, 2, 1] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_utf8_index_backwards() { + check_index_backwards_validation::(DataType::Utf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_large_utf8_index_backwards() { + check_index_backwards_validation::(DataType::LargeUtf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_binary_index_backwards() { + check_index_backwards_validation::(DataType::Binary); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_large_binary_index_backwards() { + check_index_backwards_validation::(DataType::LargeBinary); +} + +#[test] 
+#[should_panic(expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])")] +fn test_validate_dictionary_index_too_large() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // 3 is not a valid index into the values (only 0 and 1) + let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]")] +fn test_validate_dictionary_index_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! + let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +fn test_validate_dictionary_index_negative_but_not_referenced() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all, but the array is length 1 + // so the -1 should not be looked at + let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + // Expect this not to panic + ArrayData::try_new( + data_type, + 1, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)" +)] +fn 
test_validate_dictionary_index_giant_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! + let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +/// Test that the list of type `data_type` generates correct offset out of bounds errors +fn check_list_offsets(data_type: DataType) { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + + // 5 is an invalid offset into a list of only three values + let offsets: Vec = [0, 2, 5, 4] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + 0, + vec![offsets_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" +)] +fn test_validate_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::List(Box::new(field_type))); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" +)] +fn test_validate_large_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::LargeList(Box::new(field_type))); +} + +/// Test that the list of type `data_type` generates correct errors for negative offsets +#[test] +#[should_panic( + expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" +)] +fn test_validate_list_negative_offsets() { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let 
field_type = Field::new("f", values.data_type().clone(), true); + let data_type = DataType::List(Box::new(field_type)); + + // -1 is an invalid offset any way you look at it + let offsets: Vec = vec![0, 2, -1, 4]; + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + 0, + vec![offsets_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])")] +/// test that children are validated recursively (aka bugs in child data of struct also are flagged) +fn test_validate_recursive() { + // Form invalid dictionary array + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + // -1 is not a valid index + let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); + + let dict_data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + // purposely create an invalid child data + let dict_data = unsafe { + ArrayData::new_unchecked( + dict_data_type, + 2, + None, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + }; + + // Now, try and create a struct with this invalid child data (and expect an error) + let data_type = + DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); + + ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap(); +} + +/// returns a buffer initialized with some constant value for tests +fn make_i32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42i32; n]) +} + +#[test] +#[should_panic(expected = "Expected Int64 but child data had Int32")] +fn test_validate_union_different_types() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1), Some(2)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + 
Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), // data is int32 + ], + vec![0, 1], + UnionMode::Sparse, + ), + 2, + None, + 0, + vec![type_ids], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +// sparse with wrong sized children +#[test] +#[should_panic( + expected = "Sparse union child array #1 has length smaller than expected for union array (1 < 2)" +)] +fn test_validate_union_sparse_different_child_len() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + // field 2 only has 1 item but array should have 2 + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Sparse, + ), + 2, + None, + 0, + vec![type_ids], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Expected 2 buffers in array of type Union")] +fn test_validate_union_dense_without_offsets() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + 2, + None, + 0, + vec![type_ids], // need offsets buffer here too + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Need at least 8 bytes in buffers[1] in array of type Union")] +fn test_validate_union_dense_with_bad_len() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + let offsets = 
Buffer::from_slice_ref(&[0i32]); // should have 2 offsets, but only have 1 + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + 2, + None, + 0, + vec![type_ids, offsets], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +fn test_try_new_sliced_struct() { + let mut builder = StructBuilder::new( + vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Boolean, true), + ], + vec![ + Box::new(Int32Builder::with_capacity(5)), + Box::new(BooleanBuilder::with_capacity(5)), + ], + ); + + // struct[0] = { a: 10, b: true } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(10)); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(true)); + builder.append(true); + + // struct[1] = null + builder + .field_builder::(0) + .unwrap() + .append_option(None); + builder + .field_builder::(1) + .unwrap() + .append_option(None); + builder.append(false); + + // struct[2] = { a: null, b: false } + builder + .field_builder::(0) + .unwrap() + .append_option(None); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(false)); + builder.append(true); + + // struct[3] = { a: 21, b: null } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(21)); + builder + .field_builder::(1) + .unwrap() + .append_option(None); + builder.append(true); + + // struct[4] = { a: 18, b: false } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(18)); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(false)); + builder.append(true); + + let struct_array = builder.finish(); + let struct_array_slice = struct_array.slice(1, 3); + let struct_array_data = struct_array_slice.data(); + + let cloned_data = ArrayData::try_new( + struct_array_slice.data_type().clone(), + struct_array_slice.len(), + struct_array_data.null_buffer().cloned(), + 
struct_array_slice.offset(), + struct_array_data.buffers().to_vec(), + struct_array_data.child_data().to_vec(), + ) + .unwrap(); + let cloned = make_array(cloned_data); + + assert_eq!(&struct_array_slice, &cloned); +} + +#[test] +fn test_string_data_from_foreign() { + let mut strings = "foobarfoobar".to_owned(); + let mut offsets = vec![0_i32, 0, 3, 6, 12]; + let mut bitmap = vec![0b1110_u8]; + + let strings_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(strings.as_mut_ptr()), + strings.len(), + Arc::new(strings), + ) + }; + let offsets_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(offsets.as_mut_ptr() as *mut u8), + offsets.len() * std::mem::size_of::(), + Arc::new(offsets), + ) + }; + let null_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(bitmap.as_mut_ptr()), + bitmap.len(), + Arc::new(bitmap), + ) + }; + + let data = ArrayData::try_new( + DataType::Utf8, + 4, + Some(null_buffer), + 0, + vec![offsets_buffer, strings_buffer], + vec![], + ) + .unwrap(); + + let array = make_array(data); + let array = array.as_any().downcast_ref::().unwrap(); + + let expected = + StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); + + assert_eq!(array, &expected); +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal_full_validation() { + let values_builder = UInt8Builder::with_capacity(10); + let byte_width = 16; + let mut fixed_size_builder = FixedSizeListBuilder::new(values_builder, byte_width); + let value_as_bytes = 123456_i128.to_le_bytes(); + fixed_size_builder + .values() + .append_slice(value_as_bytes.as_slice()); + fixed_size_builder.append(true); + let fixed_size_array = fixed_size_builder.finish(); + + // Build ArrayData for Decimal + let builder = ArrayData::builder(DataType::Decimal128(5, 3)) + .len(fixed_size_array.len()) + .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); + let array_data = unsafe { 
builder.build_unchecked() }; + let validation_result = array_data.validate_full(); + let error = validation_result.unwrap_err(); + assert_eq!( + "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", + error.to_string() + ); +} + +#[test] +fn test_decimal_validation() { + let mut builder = Decimal128Builder::with_capacity(4, 10, 4); + builder.append_value(10000).unwrap(); + builder.append_value(20000).unwrap(); + let array = builder.finish(); + + array.data().validate_full().unwrap(); +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_sliced_array_child() { + let values = Int32Array::from_iter_values([1, 2, 3]); + let values_sliced = values.slice(1, 2); + let offsets = Buffer::from_iter([1_i32, 3_i32]); + + let list_field = Field::new("element", DataType::Int32, false); + let data_type = DataType::List(Box::new(list_field)); + + let data = unsafe { + ArrayData::new_unchecked( + data_type, + 1, + None, + None, + 0, + vec![offsets], + vec![values_sliced.into_data()], + ) + }; + + let err = data.validate_values().unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2"); +}