Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let the StringBuilder use BinaryBuilder #2181

Merged
merged 2 commits into from Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
31 changes: 23 additions & 8 deletions arrow/src/array/array_string.rs
Expand Up @@ -20,8 +20,8 @@ use std::fmt;
use std::{any::Any, iter::FromIterator};

use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray,
GenericStringIter, OffsetSizeTrait,
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait,
};
use crate::array::array::ArrayAccessor;
use crate::buffer::Buffer;
Expand Down Expand Up @@ -313,6 +313,27 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor
}
}

impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
fn from(v: GenericListArray<OffsetSize>) -> Self {
GenericStringArray::<OffsetSize>::from_list(v)
}
}

impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would allow non-utf8 data within a StringArray using safe APIs, which would break our safety guarantees

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. But this should not be done in this PR. Because it follows the style of the currentStringArray::from_list(https://github.com/apache/arrow-rs/blob/master/arrow/src/array/array_string.rs#L122-L143) which is also unsafe.

We could file a follow-up issue to track this. Maybe implementing both safe and unsafe styles is a good choice:

impl StringArray {
    unsafe fn from_list_unchecked(...) {...}
    unsafe fn from_binary_unchecked (...) {...}
}

impl From<ListArray> for StringArray {
    /// safe method with utf-8 checking
    fn from(...) {...}
}

impl From<BinaryArray> for StringArray {
    /// safe method with utf-8 checking
    fn from(...) {...}
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #2205 to track this.

let builder = v
.into_data()
.into_builder()
.data_type(Self::get_data_type());
let data = unsafe { builder.build_unchecked() };
Self::from(data)
}
}

impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetSize> {
fn from(data: ArrayData) -> Self {
assert_eq!(
Expand Down Expand Up @@ -385,12 +406,6 @@ pub type StringArray = GenericStringArray<i32>;
/// ```
pub type LargeStringArray = GenericStringArray<i64>;

impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> {
fn from(v: GenericListArray<T>) -> Self {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move, not remove

GenericStringArray::<T>::from_list(v)
}
}

#[cfg(test)]
mod tests {

Expand Down
23 changes: 23 additions & 0 deletions arrow/src/array/builder/generic_binary_builder.rs
Expand Up @@ -48,6 +48,19 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
}
}

/// Creates a new [`GenericBinaryBuilder`],
/// `item_capacity` is the number of items to pre-allocate space for in this builder
/// `data_capacity` is the number of bytes to pre-allocate space for in this builder
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
offsets_builder.append(OffsetSize::zero());
Self {
value_builder: UInt8BufferBuilder::new(data_capacity),
offsets_builder,
null_buffer_builder: NullBufferBuilder::new(item_capacity),
}
}

/// Appends a byte slice into the builder.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
Expand Down Expand Up @@ -82,6 +95,16 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
let array_data = unsafe { array_builder.build_unchecked() };
GenericBinaryArray::<OffsetSize>::from(array_data)
}

/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[u8] {
self.value_builder.as_slice()
}

/// Returns the current offsets buffer as a slice
pub fn offsets_slice(&self) -> &[OffsetSize] {
self.offsets_builder.as_slice()
}
}

impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
Expand Down
145 changes: 62 additions & 83 deletions arrow/src/array/builder/generic_string_builder.rs
Expand Up @@ -15,60 +15,46 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{
ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait,
UInt8Builder,
};
use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
use std::any::Any;
use std::sync::Arc;

use super::GenericBinaryBuilder;

/// Array builder for [`GenericStringArray`]
#[derive(Debug)]
pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
builder: GenericListBuilder<OffsetSize, UInt8Builder>,
builder: GenericBinaryBuilder<OffsetSize>,
}

impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
/// Creates a new `StringBuilder`,
/// Creates a new [`GenericStringBuilder`],
/// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
pub fn new(capacity: usize) -> Self {
let values_builder = UInt8Builder::new(capacity);
Self {
builder: GenericListBuilder::new(values_builder),
builder: GenericBinaryBuilder::new(capacity),
}
}

/// Creates a new `StringBuilder`,
/// Creates a new [`GenericStringBuilder`],
/// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
/// `item_capacity` is the number of items to pre-allocate space for in this builder
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let values_builder = UInt8Builder::new(data_capacity);
Self {
builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
}
}

/// Appends a string into the builder.
///
/// Automatically calls the `append` method to delimit the string appended in as a
/// distinct array element.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<str>) {
self.builder
.values()
.append_slice(value.as_ref().as_bytes());
self.builder.append(true);
}

/// Finish the current variable-length list array slot.
#[inline]
pub fn append(&mut self, is_valid: bool) {
self.builder.append(is_valid)
self.builder.append_value(value.as_ref().as_bytes());
}

/// Append a null value to the array.
#[inline]
pub fn append_null(&mut self) {
self.append(false)
self.builder.append_null()
}

/// Append an `Option` value to the array.
Expand All @@ -80,14 +66,14 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
};
}

/// Builds the `StringArray` and reset this builder.
/// Builds the [`GenericStringArray`] and reset this builder.
pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
GenericStringArray::<OffsetSize>::from(self.builder.finish())
}

/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[u8] {
self.builder.values_ref().values_slice()
self.builder.values_slice()
}

/// Returns the current offsets buffer as a slice
Expand Down Expand Up @@ -131,79 +117,72 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSi

#[cfg(test)]
mod tests {
use crate::array::builder::StringBuilder;
use crate::array::{Array, ArrayBuilder};
use super::*;
use crate::array::{Array, ArrayBuilder, OffsetSizeTrait};

#[test]
fn test_string_array_builder() {
let mut builder = StringBuilder::new(20);
fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
let mut builder = GenericStringBuilder::<O>::new(20);
let owned = "arrow".to_owned();

builder.append_value("hello");
builder.append(true);
builder.append_value("world");

let string_array = builder.finish();
builder.append_value("");
builder.append_value(&owned);
builder.append_null();
builder.append_option(Some("rust"));
builder.append_option(None::<&str>);
builder.append_option(None::<String>);
assert_eq!(7, builder.len());

assert_eq!(
GenericStringArray::<O>::from(vec![
Some("hello"),
Some(""),
Some("arrow"),
None,
Some("rust"),
None,
None
]),
builder.finish()
);
}

assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("", string_array.value(1));
assert_eq!("world", string_array.value(2));
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(5, string_array.value_length(2));
#[test]
fn test_string_array_builder() {
_test_generic_string_array_builder::<i32>()
}

#[test]
fn test_string_array_builder_finish() {
let mut builder = StringBuilder::new(10);
fn test_large_string_array_builder() {
_test_generic_string_array_builder::<i64>()
}

fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

builder.append_value("hello");
builder.append_value("world");
builder.append_value("rust");
builder.append_null();

let mut arr = builder.finish();
assert_eq!(2, arr.len());
assert_eq!(0, builder.len());
builder.finish();
assert!(builder.is_empty());
assert_eq!(&[O::zero()], builder.offsets_slice());

builder.append_value("arrow");
arr = builder.finish();
assert_eq!(1, arr.len());
assert_eq!(0, builder.len());
builder.append_value("parquet");
let arr = builder.finish();
// array should not have null buffer because there is not `null` value.
assert_eq!(None, arr.data().null_buffer());
assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
}

#[test]
fn test_string_array_builder_append_string() {
let mut builder = StringBuilder::new(20);

let var = "hello".to_owned();
builder.append_value(&var);
builder.append(true);
builder.append_value("world");

let string_array = builder.finish();

assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("", string_array.value(1));
assert_eq!("world", string_array.value(2));
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(5, string_array.value_length(2));
fn test_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i32>()
}

#[test]
fn test_string_array_builder_append_option() {
let mut builder = StringBuilder::new(20);
builder.append_option(Some("hello"));
builder.append_option(None::<&str>);
builder.append_option(None::<String>);
builder.append_option(Some("world"));

let string_array = builder.finish();

assert_eq!(4, string_array.len());
assert_eq!("hello", string_array.value(0));
assert!(string_array.is_null(1));
assert!(string_array.is_null(2));
assert_eq!("world", string_array.value(3));
fn test_large_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i64>()
}
}