From 48d2d2fec7cb2fdb4b11f357804732b63eec31c4 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 18 Oct 2022 05:44:25 +0000 Subject: [PATCH] Added MutableBinaryValuesArray --- src/array/binary/iterator.rs | 11 +- src/array/binary/mod.rs | 2 + src/array/binary/mutable.rs | 210 ++++++------ src/array/binary/mutable_values.rs | 410 ++++++++++++++++++++++++ src/array/mod.rs | 2 +- src/array/specification.rs | 7 - src/array/utf8/mutable.rs | 12 +- tests/it/array/binary/mod.rs | 1 + tests/it/array/binary/mutable_values.rs | 105 ++++++ tests/it/array/utf8/mutable_values.rs | 14 +- 10 files changed, 656 insertions(+), 118 deletions(-) create mode 100644 src/array/binary/mutable_values.rs create mode 100644 tests/it/array/binary/mutable_values.rs diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 43f85176197..7d452fd4b91 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -3,7 +3,7 @@ use crate::{ bitmap::utils::ZipValidity, }; -use super::BinaryArray; +use super::{BinaryArray, MutableBinaryValuesArray}; unsafe impl<'a, O: Offset> ArrayAccessor<'a> for BinaryArray { type Item = &'a [u8]; @@ -30,3 +30,12 @@ impl<'a, O: Offset> IntoIterator for &'a BinaryArray { self.iter() } } + +impl<'a, O: Offset> IntoIterator for &'a MutableBinaryValuesArray { + type Item = &'a [u8]; + type IntoIter = ArrayValuesIter<'a, MutableBinaryValuesArray>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 39c2a4fcfe9..cbafd5c98b7 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -21,6 +21,8 @@ pub(super) mod fmt; mod iterator; pub use iterator::*; mod from; +mod mutable_values; +pub use mutable_values::*; mod mutable; pub use mutable::*; diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index c1ee4ba8a6e..1d52f1cbafd 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -1,14 +1,14 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ - array::{specification::check_offsets, Array, MutableArray, Offset, TryExtend, TryPush}, - bitmap::MutableBitmap, + array::{Array, MutableArray, Offset, TryExtend, TryPush}, + bitmap::{Bitmap, MutableBitmap}, datatypes::DataType, error::{Error, Result}, trusted_len::TrustedLen, }; -use super::BinaryArray; +use super::{BinaryArray, MutableBinaryValuesArray}; use crate::array::physical_binary::*; /// The Arrow's equivalent to `Vec>>`. @@ -17,20 +17,18 @@ use crate::array::physical_binary::*; /// This struct does not allocate a validity until one is required (i.e. push a null to it). #[derive(Debug)] pub struct MutableBinaryArray { - data_type: DataType, - offsets: Vec, - values: Vec, + values: MutableBinaryValuesArray, validity: Option, } impl From> for BinaryArray { fn from(other: MutableBinaryArray) -> Self { - BinaryArray::::new( - other.data_type, - other.offsets.into(), - other.values.into(), - other.validity.map(|x| x.into()), - ) + let validity = other.validity.and_then(|x| { + let validity: Option = x.into(); + validity + }); + let array: BinaryArray = other.values.into(); + array.with_validity(validity) } } @@ -48,76 +46,83 @@ impl MutableBinaryArray { Self::with_capacity(0) } - /// The canonical method to create a [`MutableBinaryArray`] out of low-end APIs. + /// Returns a [`MutableBinaryArray`] created from its internal representation. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. + /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn try_new( + data_type: DataType, + offsets: Vec, + values: Vec, + validity: Option, + ) -> Result { + let values = MutableBinaryValuesArray::try_new(data_type, offsets, values)?; + + if validity + .as_ref() + .map_or(false, |validity| validity.len() != values.len()) + { + return Err(Error::oos( + "validity's length must be equal to the number of values", + )); + } + + Ok(Self { values, validity }) + } + + /// Create a [`MutableBinaryArray`] out of its inner attributes. + /// # Safety + /// The caller must ensure that every value between offsets is a valid utf8. /// # Panics /// This function panics iff: /// * The `offsets` and `values` are inconsistent /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub fn from_data( + pub unsafe fn new_unchecked( data_type: DataType, offsets: Vec, values: Vec, validity: Option, ) -> Self { - check_offsets(&offsets, values.len()); + let values = MutableBinaryValuesArray::new_unchecked(data_type, offsets, values); if let Some(ref validity) = validity { - assert_eq!(offsets.len() - 1, validity.len()); - } - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - panic!("MutableBinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary") - } - Self { - data_type, - offsets, - values, - validity, + assert_eq!(values.len(), validity.len()); } + Self { values, validity } } fn default_data_type() -> DataType { BinaryArray::::default_data_type() } - /// Creates a new [`MutableBinaryArray`] with capacity for `capacity` values. - /// # Implementation - /// This does not allocate the validity. + /// Initializes a new [`MutableBinaryArray`] with a pre-allocated capacity of slots. pub fn with_capacity(capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { - data_type: BinaryArray::::default_data_type(), - offsets, - values: Vec::::new(), - validity: None, - } + Self::with_capacities(capacity, 0) } /// Initializes a new [`MutableBinaryArray`] with a pre-allocated capacity of slots and values. + /// # Implementation + /// This does not allocate the validity. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { - data_type: Self::default_data_type(), - offsets, - values: Vec::::with_capacity(values), + values: MutableBinaryValuesArray::with_capacities(capacity, values), validity: None, } } - /// Reserves `additional` slots. - pub fn reserve(&mut self, additional: usize) { - self.offsets.reserve(additional); + /// Reserves `additional` elements and `additional_values` on the values buffer. + pub fn reserve(&mut self, additional: usize, additional_values: usize) { + self.values.reserve(additional, additional_values); if let Some(x) = self.validity.as_mut() { x.reserve(additional) } } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - /// Pushes a new element to the array. /// # Panic /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value. @@ -128,12 +133,7 @@ impl MutableBinaryArray { /// Pop the last entry from [`MutableBinaryArray`]. /// This function returns `None` iff this array is empty pub fn pop(&mut self) -> Option> { - if self.offsets.len() < 2 { - return None; - } - self.offsets.pop()?; - let value_start = self.offsets.iter().last().cloned()?.to_usize(); - let value = self.values.split_off(value_start); + let value = self.values.pop()?; self.validity .as_mut() .map(|x| x.pop()?.then(|| ())) @@ -152,10 +152,10 @@ impl MutableBinaryArray { } fn init_validity(&mut self) { - let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); + let mut validity = MutableBitmap::with_capacity(self.values.capacity()); validity.extend_constant(self.len(), true); validity.set(self.len() - 1, false); - self.validity = Some(validity) + self.validity = Some(validity); } /// Converts itself into an [`Array`]. @@ -167,28 +167,37 @@ impl MutableBinaryArray { /// Shrinks the capacity of the [`MutableBinaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); - self.offsets.shrink_to_fit(); if let Some(validity) = &mut self.validity { validity.shrink_to_fit() } } + + /// Equivalent to `Self::try_new(...).unwrap()` + pub fn from_data( + data_type: DataType, + offsets: Vec, + values: Vec, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, values, validity).unwrap() + } } impl MutableBinaryArray { /// returns its values. pub fn values(&self) -> &Vec { - &self.values + self.values.values() } /// returns its offsets. pub fn offsets(&self) -> &Vec { - &self.offsets + self.values.offsets() } } impl MutableArray for MutableBinaryArray { fn len(&self) -> usize { - self.offsets.len() - 1 + self.values.len() } fn validity(&self) -> Option<&MutableBitmap> { @@ -196,25 +205,39 @@ impl MutableArray for MutableBinaryArray { } fn as_box(&mut self) -> Box { - Box::new(BinaryArray::new( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - std::mem::take(&mut self.values).into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - )) + // Safety: + // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus + // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. + let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); + unsafe { + BinaryArray::new_unchecked( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + } + .boxed() } fn as_arc(&mut self) -> Arc { - Arc::new(BinaryArray::new( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - std::mem::take(&mut self.values).into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - )) + // Safety: + // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus + // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. + let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); + unsafe { + BinaryArray::new_unchecked( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + } + .arced() } fn data_type(&self) -> &DataType { - &self.data_type + self.values.data_type() } fn as_any(&self) -> &dyn std::any::Any { @@ -231,7 +254,7 @@ impl MutableArray for MutableBinaryArray { } fn reserve(&mut self, additional: usize) { - self.reserve(additional) + self.reserve(additional, 0) } fn shrink_to_fit(&mut self) { @@ -353,7 +376,9 @@ impl MutableBinaryArray { P: AsRef<[u8]>, I: Iterator, { - let additional = extend_from_values_iter(&mut self.offsets, &mut self.values, iterator); + let length = self.values.len(); + self.values.extend(iterator); + let additional = self.values.len() - length; if let Some(validity) = self.validity.as_mut() { validity.extend_constant(additional, true); @@ -371,10 +396,9 @@ impl MutableBinaryArray { P: AsRef<[u8]>, I: Iterator, { - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_trusted_len_values requires an upper limit"); - - extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + let length = self.values.len(); + self.values.extend_trusted_len_unchecked(iterator); + let additional = self.values.len() - length; if let Some(validity) = self.validity.as_mut() { validity.extend_constant(additional, true); @@ -407,16 +431,8 @@ impl MutableBinaryArray { self.validity = Some(validity); } - extend_from_trusted_len_iter( - &mut self.offsets, - &mut self.values, - self.validity.as_mut().unwrap(), - iterator, - ); - - if self.validity.as_mut().unwrap().unset_bits() == 0 { - self.validity = None; - } + self.values + .extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator); } /// Creates a new [`MutableBinaryArray`] from a [`Iterator`] of `&[u8]`. @@ -435,7 +451,7 @@ impl> Extend> for MutableBinaryArray { impl> TryExtend> for MutableBinaryArray { fn try_extend>>(&mut self, iter: I) -> Result<()> { let mut iter = iter.into_iter(); - self.reserve(iter.size_hint().0); + self.reserve(iter.size_hint().0, 0); iter.try_for_each(|x| self.try_push(x)) } } @@ -444,13 +460,7 @@ impl> TryPush> for MutableBinaryArray { fn try_push(&mut self, value: Option) -> Result<()> { match value { Some(value) => { - let bytes = value.as_ref(); - - let size = O::from_usize(self.values.len() + bytes.len()).ok_or(Error::Overflow)?; - - self.values.extend_from_slice(bytes); - - self.offsets.push(size); + self.values.try_push(value.as_ref())?; match &mut self.validity { Some(validity) => validity.push(true), @@ -458,7 +468,7 @@ impl> TryPush> for MutableBinaryArray { } } None => { - self.offsets.push(self.last_offset()); + self.values.push(""); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs new file mode 100644 index 00000000000..18518889c6e --- /dev/null +++ b/src/array/binary/mutable_values.rs @@ -0,0 +1,410 @@ +use std::{iter::FromIterator, sync::Arc}; + +use crate::{ + array::{ + specification::{check_offsets_minimal, try_check_offsets}, + Array, ArrayAccessor, ArrayValuesIter, MutableArray, Offset, TryExtend, TryPush, + }, + bitmap::MutableBitmap, + datatypes::DataType, + error::{Error, Result}, + trusted_len::TrustedLen, +}; + +use super::{BinaryArray, MutableBinaryArray}; +use crate::array::physical_binary::*; + +/// A [`MutableArray`] that builds a [`BinaryArray`]. It differs +/// from [`MutableBinaryArray`] in that it builds non-null [`BinaryArray`]. +#[derive(Debug, Clone)] +pub struct MutableBinaryValuesArray { + data_type: DataType, + offsets: Vec, + values: Vec, +} + +impl From> for BinaryArray { + fn from(other: MutableBinaryValuesArray) -> Self { + // Safety: + // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus + // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. + unsafe { + BinaryArray::::from_data_unchecked( + other.data_type, + other.offsets.into(), + other.values.into(), + None, + ) + } + } +} + +impl From> for MutableBinaryArray { + fn from(other: MutableBinaryValuesArray) -> Self { + // Safety: + // `MutableBinaryValuesArray` has the same invariants as `MutableBinaryArray` + unsafe { + MutableBinaryArray::::new_unchecked( + other.data_type, + other.offsets, + other.values, + None, + ) + } + } +} + +impl Default for MutableBinaryValuesArray { + fn default() -> Self { + Self::new() + } +} + +impl MutableBinaryValuesArray { + /// Returns an empty [`MutableBinaryValuesArray`]. + pub fn new() -> Self { + Self { + data_type: Self::default_data_type(), + offsets: vec![O::default()], + values: Vec::::new(), + } + } + + /// Returns a [`MutableBinaryValuesArray`] created from its internal representation. + /// + /// # Errors + /// This function returns an error iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. + /// # Implementation + /// This function is `O(N)` - checking monotinicity is `O(N)` + pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { + try_check_offsets(&offsets, values.len())?; + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(Error::oos( + "MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary", + )); + } + + Ok(Self { + data_type, + offsets, + values, + }) + } + + /// Returns a [`MutableBinaryValuesArray`] created from its internal representation. + /// + /// # Panic + /// This function does not panic iff: + /// * The last offset is equal to the values' length. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is equal to either `Binary` or `LargeBinary`. + /// # Safety + /// This function is safe iff: + /// * the offsets are monotonically increasing + /// # Implementation + /// This function is `O(1)` + pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { + check_offsets_minimal(&offsets, values.len()); + + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary") + } + + Self { + data_type, + offsets, + values, + } + } + + /// Returns the default [`DataType`] of this container: [`DataType::Utf8`] or [`DataType::LargeUtf8`] + /// depending on the generic [`Offset`]. + pub fn default_data_type() -> DataType { + BinaryArray::::default_data_type() + } + + /// Initializes a new [`MutableBinaryValuesArray`] with a pre-allocated capacity of items. + pub fn with_capacity(capacity: usize) -> Self { + Self::with_capacities(capacity, 0) + } + + /// Initializes a new [`MutableBinaryValuesArray`] with a pre-allocated capacity of items and values. + pub fn with_capacities(capacity: usize, values: usize) -> Self { + let mut offsets = Vec::::with_capacity(capacity + 1); + offsets.push(O::default()); + + Self { + data_type: Self::default_data_type(), + offsets, + values: Vec::::with_capacity(values), + } + } + + /// returns its values. + #[inline] + pub fn values(&self) -> &Vec { + &self.values + } + + /// returns its offsets. + #[inline] + pub fn offsets(&self) -> &Vec { + &self.offsets + } + + /// Reserves `additional` elements and `additional_values` on the values. + #[inline] + pub fn reserve(&mut self, additional: usize, additional_values: usize) { + self.offsets.reserve(additional + 1); + self.values.reserve(additional_values); + } + + /// Returns the capacity in number of items + pub fn capacity(&self) -> usize { + self.offsets.capacity() - 1 + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.offsets.len() - 1 + } + + /// Pushes a new item to the array. + /// # Panic + /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value. + #[inline] + pub fn push>(&mut self, value: T) { + self.try_push(value).unwrap() + } + + /// Pop the last entry from [`MutableBinaryValuesArray`]. + /// This function returns `None` iff this array is empty. + pub fn pop(&mut self) -> Option> { + if self.len() == 0 { + return None; + } + self.offsets.pop()?; + let start = self.offsets.last()?.to_usize(); + let value = self.values.split_off(start); + Some(value.to_vec()) + } + + /// Returns the value of the element at index `i`. + /// # Panic + /// This function panics iff `i >= self.len`. + #[inline] + pub fn value(&self, i: usize) -> &[u8] { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the value of the element at index `i`. + /// # Safety + /// This function is safe iff `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { + // soundness: the invariant of the function + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); + + // soundness: the invariant of the struct + self.values.get_unchecked(start..end) + } + + /// Returns an iterator of `&[u8]` + pub fn iter(&self) -> ArrayValuesIter { + ArrayValuesIter::new(self) + } + + /// Shrinks the capacity of the [`MutableBinaryValuesArray`] to fit its current length. + pub fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + self.offsets.shrink_to_fit(); + } + + /// Extract the low-end APIs from the [`MutableBinaryValuesArray`]. + pub fn into_inner(self) -> (DataType, Vec, Vec) { + (self.data_type, self.offsets, self.values) + } +} + +impl MutableArray for MutableBinaryValuesArray { + fn len(&self) -> usize { + self.len() + } + + fn validity(&self) -> Option<&MutableBitmap> { + None + } + + fn as_box(&mut self) -> Box { + // Safety: + // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus + // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. + let (data_type, offsets, values) = std::mem::take(self).into_inner(); + unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } + .boxed() + } + + fn as_arc(&mut self) -> Arc { + // Safety: + // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus + // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. + let (data_type, offsets, values) = std::mem::take(self).into_inner(); + unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } + .arced() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn std::any::Any { + self + } + + #[inline] + fn push_null(&mut self) { + self.push::<&[u8]>(b"") + } + + fn reserve(&mut self, additional: usize) { + self.reserve(additional, 0) + } + + fn shrink_to_fit(&mut self) { + self.shrink_to_fit() + } +} + +impl> FromIterator

for MutableBinaryValuesArray { + fn from_iter>(iter: I) -> Self { + let (offsets, values) = values_iter(iter.into_iter()); + // soundness: T: AsRef<[u8]> and offsets are monotonically increasing + unsafe { Self::new_unchecked(Self::default_data_type(), offsets, values) } + } +} + +impl MutableBinaryValuesArray { + pub(crate) unsafe fn extend_from_trusted_len_iter( + &mut self, + validity: &mut MutableBitmap, + iterator: I, + ) where + P: AsRef<[u8]>, + I: Iterator>, + { + extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator); + } + + /// Extends the [`MutableBinaryValuesArray`] from a [`TrustedLen`] + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: TrustedLen, + { + unsafe { self.extend_trusted_len_unchecked(iterator) } + } + + /// Extends [`MutableBinaryValuesArray`] from an iterator of trusted len. + /// # Safety + /// The iterator must be trusted len. + #[inline] + pub unsafe fn extend_trusted_len_unchecked(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: Iterator, + { + extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + } + + /// Creates a [`MutableBinaryValuesArray`] from a [`TrustedLen`] + #[inline] + pub fn from_trusted_len_iter(iterator: I) -> Self + where + P: AsRef<[u8]>, + I: TrustedLen, + { + // soundness: I is `TrustedLen` + unsafe { Self::from_trusted_len_iter_unchecked(iterator) } + } + + /// Returns a new [`MutableBinaryValuesArray`] from an iterator of trusted length. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + #[inline] + pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self + where + P: AsRef<[u8]>, + I: Iterator, + { + let (offsets, values) = trusted_len_values_iter(iterator); + + // soundness: offsets are monotonically increasing + Self::new_unchecked(Self::default_data_type(), offsets, values) + } + + /// Returns a new [`MutableBinaryValuesArray`] from an iterator. + /// # Error + /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value. + /// (`i32::MAX` or `i64::MAX` respectively). + pub fn try_from_iter, I: IntoIterator>(iter: I) -> Result { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + let mut array = Self::with_capacity(lower); + for item in iterator { + array.try_push(item)?; + } + Ok(array) + } +} + +impl> Extend for MutableBinaryValuesArray { + fn extend>(&mut self, iter: I) { + extend_from_values_iter(&mut self.offsets, &mut self.values, iter.into_iter()); + } +} + +impl> TryExtend for MutableBinaryValuesArray { + fn try_extend>(&mut self, iter: I) -> Result<()> { + let mut iter = iter.into_iter(); + self.reserve(iter.size_hint().0, 0); + iter.try_for_each(|x| self.try_push(x)) + } +} + +impl> TryPush for MutableBinaryValuesArray { + #[inline] + fn try_push(&mut self, value: T) -> Result<()> { + let bytes = value.as_ref(); + self.values.extend_from_slice(bytes); + + let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; + + self.offsets.push(size); + Ok(()) + } +} + +unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableBinaryValuesArray { + type Item = &'a [u8]; + + #[inline] + unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item { + self.value_unchecked(index) + } + + #[inline] + fn len(&self) -> usize { + self.len() + } +} diff --git a/src/array/mod.rs b/src/array/mod.rs index b78bb3f3b81..621c776692f 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -387,7 +387,7 @@ pub use equal::equal; pub use fmt::{get_display, get_value_display}; pub use crate::types::Offset; -pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray}; +pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; pub use fixed_size_binary::{FixedSizeBinaryArray, MutableFixedSizeBinaryArray}; diff --git a/src/array/specification.rs b/src/array/specification.rs index 2b189b76afc..6b9139feedf 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -73,13 +73,6 @@ pub fn try_check_offsets_and_utf8(offsets: &[O], values: &[u8]) -> Re } } -/// # Panics iff: -/// * the `offsets` is not monotonically increasing, or -/// * any offset is larger or equal to `values_len`. -pub fn check_offsets(offsets: &[O], values_len: usize) { - try_check_offsets(offsets, values_len).unwrap() -} - /// Checks that `offsets` is monotonically increasing, and all offsets are less than or equal to /// `values_len`. pub fn try_check_offsets(offsets: &[O], values_len: usize) -> Result<()> { diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 0cc1f8d1c5f..cd94d5eae85 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -92,7 +92,11 @@ impl MutableUtf8Array { values: Vec, validity: Option, ) -> Self { - Self::from_data_unchecked(data_type, offsets, values, validity) + let values = MutableUtf8ValuesArray::new_unchecked(data_type, offsets, values); + if let Some(ref validity) = validity { + assert_eq!(values.len(), validity.len()); + } + Self { values, validity } } /// Alias of `new_unchecked` @@ -104,11 +108,7 @@ impl MutableUtf8Array { values: Vec, validity: Option, ) -> Self { - let values = MutableUtf8ValuesArray::new_unchecked(data_type, offsets, values); - if let Some(ref validity) = validity { - assert_eq!(values.len(), validity.len()); - } - Self { values, validity } + Self::new_unchecked(data_type, offsets, values, validity) } /// The canonical method to create a [`MutableUtf8Array`] out of low-end APIs. diff --git a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index c36b7121414..d26cf4ad3f4 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -6,6 +6,7 @@ use arrow2::{ }; mod mutable; +mod mutable_values; mod to_mutable; #[test] diff --git a/tests/it/array/binary/mutable_values.rs b/tests/it/array/binary/mutable_values.rs new file mode 100644 index 00000000000..af02b1d54b3 --- /dev/null +++ b/tests/it/array/binary/mutable_values.rs @@ -0,0 +1,105 @@ +use arrow2::array::MutableArray; +use arrow2::array::MutableBinaryValuesArray; +use arrow2::datatypes::DataType; + +#[test] +fn capacity() { + let mut b = MutableBinaryValuesArray::::with_capacity(100); + + assert_eq!(b.values().capacity(), 0); + assert!(b.offsets().capacity() >= 101); + b.shrink_to_fit(); + assert!(b.offsets().capacity() < 101); +} + +#[test] +fn offsets_must_be_monotonic_increasing() { + let offsets = vec![0, 5, 4]; + let values = b"abbbbb".to_vec(); + assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); +} + +#[test] +fn offsets_must_be_in_bounds() { + let offsets = vec![0, 10]; + let values = b"abbbbb".to_vec(); + assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); +} + +#[test] +fn data_type_must_be_consistent() { + let offsets = vec![0, 4]; + let values = b"abbb".to_vec(); + assert!(MutableBinaryValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); +} + +#[test] +fn as_box() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); + let _ = b.as_box(); +} + +#[test] +fn as_arc() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); + let _ = b.as_arc(); +} + +#[test] +fn extend_trusted_len() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); + b.extend_trusted_len(vec!["a", "b"].into_iter()); + + let offsets = vec![0, 2, 3, 4]; + let values = b"abab".to_vec(); + assert_eq!( + b.as_box(), + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values) + .unwrap() + .as_box() + ) +} + +#[test] +fn from_trusted_len() { + let mut b = MutableBinaryValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); + + let offsets = vec![0, 1, 2]; + let values = b"ab".to_vec(); + assert_eq!( + b.as_box(), + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values) + .unwrap() + .as_box() + ) +} + +#[test] +fn extend_from_iter() { + let offsets = vec![0, 2]; + let values = b"ab".to_vec(); + let mut b = + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); + b.extend_trusted_len(vec!["a", "b"].into_iter()); + + let a = b.clone(); + b.extend_trusted_len(a.iter()); + + let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let values = b"abababab".to_vec(); + assert_eq!( + b.as_box(), + MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values) + .unwrap() + .as_box() + ) +} diff --git a/tests/it/array/utf8/mutable_values.rs b/tests/it/array/utf8/mutable_values.rs index fbb2abf0af6..1ad783b607d 100644 --- a/tests/it/array/utf8/mutable_values.rs +++ b/tests/it/array/utf8/mutable_values.rs @@ -19,6 +19,13 @@ fn offsets_must_be_monotonic_increasing() { assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); } +#[test] +fn offsets_must_be_in_bounds() { + let offsets = vec![0, 10]; + let values = b"abbbbb".to_vec(); + assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); +} + #[test] fn data_type_must_be_consistent() { let offsets = vec![0, 4]; @@ -28,9 +35,10 @@ fn data_type_must_be_consistent() { #[test] fn must_be_utf8() { - let offsets = vec![0, 2]; - let values = vec![207, 128]; - assert!(MutableUtf8ValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); + let offsets = vec![0, 4]; + let values = vec![0, 159, 146, 150]; + assert!(std::str::from_utf8(&values).is_err()); + assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); } #[test]