Skip to content

Commit

Permalink
Support casting StringArray/BinaryArray --> StringView / `Binar…
Browse files Browse the repository at this point in the history
…yView` (#5686)

* Support casting from byte array to byte view array.

* Use new_unchecked.

* Add safety justification comment

* Fix comments :facepalm

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
RinChanNOWWW and alamb committed Apr 26, 2024
1 parent 08af471 commit a61f1dc
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 2 deletions.
12 changes: 12 additions & 0 deletions arrow-array/src/array/byte_view_array.rs
Expand Up @@ -428,6 +428,18 @@ impl BinaryViewArray {
}
}

impl From<Vec<&[u8]>> for BinaryViewArray {
fn from(v: Vec<&[u8]>) -> Self {
Self::from_iter_values(v)
}
}

impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
v.into_iter().collect()
}
}

/// A [`GenericByteViewArray`] that stores utf8 data
///
/// # Example
Expand Down
129 changes: 127 additions & 2 deletions arrow-cast/src/cast/mod.rs
Expand Up @@ -46,6 +46,8 @@ use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;

use arrow_buffer::ScalarBuffer;
use arrow_data::ByteView;
use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
use std::sync::Arc;
Expand Down Expand Up @@ -119,6 +121,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Utf8
| LargeBinary
| LargeUtf8
| BinaryView
| Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
Expand Down Expand Up @@ -192,8 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8
}

(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
(FixedSizeBinary(_), Binary | LargeBinary) => true,
(
Utf8 | LargeUtf8,
Expand All @@ -213,6 +217,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Timestamp(Nanosecond, _)
| Interval(_),
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),

Expand Down Expand Up @@ -611,6 +616,8 @@ pub fn cast_with_options(
| Utf8
| LargeBinary
| LargeUtf8
| BinaryView
| Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
Expand Down Expand Up @@ -1120,6 +1127,7 @@ pub fn cast_with_options(
let binary = BinaryArray::from(array.as_string::<i32>().clone());
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
Expand Down Expand Up @@ -1179,6 +1187,7 @@ pub fn cast_with_options(
LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
array.as_string::<i64>().clone(),
))),
Utf8View => cast_byte_to_view::<LargeUtf8Type, StringViewType>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
Expand Down Expand Up @@ -1226,6 +1235,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i32>(array, *size, cast_options)
}
BinaryView => cast_byte_to_view::<BinaryType, BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
Expand All @@ -1240,6 +1250,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i64>(array, *size, cast_options)
}
BinaryView => cast_byte_to_view::<LargeBinaryType, BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
Expand Down Expand Up @@ -2238,6 +2249,56 @@ where
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
}

/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` array.
fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef, ArrowError>
where
FROM: ByteArrayType,
FROM::Offset: OffsetSizeTrait + ToPrimitive,
V: ByteViewType,
{
let data = array.to_data();
assert_eq!(data.data_type(), &FROM::DATA_TYPE);

let len = array.len();
let str_values_buf = data.buffers()[1].clone();
let offsets = data.buffers()[0].typed_data::<FROM::Offset>();

let mut views_builder = BufferBuilder::<u128>::new(len);
for w in offsets.windows(2) {
let offset = w[0].to_u32().unwrap();
let end = w[1].to_u32().unwrap();
let value_buf = &str_values_buf[offset as usize..end as usize];
let length = end - offset;

if length <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length,
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
buffer_index: 0,
offset,
};
views_builder.append(view.into());
}
}

assert_eq!(views_builder.len(), len);

// Safety: the input was a valid array so it valid UTF8 (if string). And
// all offsets were valid and we created the views correctly
Ok(Arc::new(unsafe {
GenericByteViewArray::<V>::new_unchecked(
ScalarBuffer::new(views_builder.finish(), 0, len),
vec![str_values_buf],
data.nulls().cloned(),
)
}))
}

#[cfg(test)]
mod tests {
use arrow_buffer::{Buffer, NullBuffer};
Expand Down Expand Up @@ -5044,6 +5105,70 @@ mod tests {
}
}

// Runs the string-to-view cast check once for each offset type.
#[test]
fn test_string_to_view() {
    // `i32` offsets
    _test_string_to_view::<i32>();
    // `i64` offsets
    _test_string_to_view::<i64>();
}

/// Casts a `GenericStringArray<O>` to `Utf8View` and compares the result with
/// a `StringViewArray` built directly from the same values.
///
/// The values include a null, strings of at most 12 bytes and one string
/// longer than 12 bytes.
fn _test_string_to_view<O>()
where
    O: OffsetSizeTrait,
{
    let values = vec![
        Some("hello"),
        Some("world"),
        None,
        Some("large payload over 12 bytes"),
        Some("lulu"),
    ];

    let expected = StringViewArray::from(values.clone());
    let input = GenericStringArray::<O>::from(values);

    // The cast must be advertised as supported ...
    let castable = can_cast_types(input.data_type(), &DataType::Utf8View);
    assert!(castable);

    // ... and must produce the expected type and contents.
    let actual = cast(&input, &DataType::Utf8View).unwrap();
    assert_eq!(actual.data_type(), &DataType::Utf8View);
    assert_eq!(actual.as_ref(), &expected);
}

// Runs the binary-to-view cast check once for each offset type.
// NOTE(review): the test name has a typo ("bianry"); it is left unchanged here
// because the rest of the file is not visible to rule out a name clash.
#[test]
fn test_bianry_to_view() {
    // `i32` offsets
    _test_binary_to_view::<i32>();
    // `i64` offsets
    _test_binary_to_view::<i64>();
}

/// Casts a `GenericBinaryArray<O>` to `BinaryView` and compares the result
/// with a `BinaryViewArray` built directly from the same values.
///
/// The values include a null, byte strings of at most 12 bytes and one byte
/// string longer than 12 bytes.
fn _test_binary_to_view<O>()
where
    O: OffsetSizeTrait,
{
    let values: Vec<Option<&[u8]>> = vec![
        Some(b"hello"),
        Some(b"world"),
        None,
        Some(b"large payload over 12 bytes"),
        Some(b"lulu"),
    ];

    let expected = BinaryViewArray::from(values.clone());
    let input = GenericBinaryArray::<O>::from(values);

    // The cast must be advertised as supported ...
    let castable = can_cast_types(input.data_type(), &DataType::BinaryView);
    assert!(castable);

    // ... and must produce the expected type and contents.
    let actual = cast(&input, &DataType::BinaryView).unwrap();
    assert_eq!(actual.data_type(), &DataType::BinaryView);
    assert_eq!(actual.as_ref(), &expected);
}

#[test]
fn test_cast_from_f64() {
let f64_values: Vec<f64> = vec![
Expand Down

0 comments on commit a61f1dc

Please sign in to comment.