Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utf8array casting #2456

Merged
merged 4 commits into from Aug 16, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
157 changes: 99 additions & 58 deletions arrow/src/compute/kernels/cast.rs
Expand Up @@ -146,7 +146,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(Utf8, LargeUtf8) => true,
(LargeUtf8, Utf8) => true,
(Utf8,
Date32
Binary
| Date32
| Date64
| Time32(TimeUnit::Second)
| Time32(TimeUnit::Millisecond)
Expand All @@ -156,7 +157,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
) => true,
(Utf8, _) => DataType::is_numeric(to_type),
(LargeUtf8,
Date32
LargeBinary
| Date32
| Date64
| Time32(TimeUnit::Second)
| Time32(TimeUnit::Millisecond)
Expand Down Expand Up @@ -693,6 +695,7 @@ pub fn cast_with_options(
Float64 => cast_string_to_numeric::<Float64Type, i32>(array, cast_options),
Date32 => cast_string_to_date32::<i32>(&**array, cast_options),
Date64 => cast_string_to_date64::<i32>(&**array, cast_options),
Binary => cast_string_to_binary::<i32>(array),
Time32(TimeUnit::Second) => {
cast_string_to_time32second::<i32>(&**array, cast_options)
}
Expand Down Expand Up @@ -839,6 +842,7 @@ pub fn cast_with_options(
Float64 => cast_string_to_numeric::<Float64Type, i64>(array, cast_options),
Date32 => cast_string_to_date32::<i64>(&**array, cast_options),
Date64 => cast_string_to_date64::<i64>(&**array, cast_options),
LargeBinary => cast_string_to_binary::<i64>(array),
Time32(TimeUnit::Second) => {
cast_string_to_time32second::<i64>(&**array, cast_options)
}
Expand Down Expand Up @@ -1254,6 +1258,24 @@ pub fn cast_with_options(
}
}

/// Casts a string array to a binary array
fn cast_string_to_binary<OffsetSize>(array: &ArrayRef) -> Result<ArrayRef>
where
OffsetSize: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericStringArray<OffsetSize>>()
.unwrap();

Ok(Arc::new(
array
.iter()
.map(|x| x.map(|data| data.as_bytes()))
.collect::<GenericBinaryArray<OffsetSize>>(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this isn't changing the size of the offsets, it would be significantly faster to just reuse the existing buffers

Something like

assert_eq!(array.data_type(), &DataType::Utf8);
array.data().clone().into_builder().data_type(DataType::Binary).build_unchecked()

And similar for LargeUtf8 and LargeBinary

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed it in latest commit

))
}

/// Get the time unit as a multiple of a second
const fn time_unit_multiple(unit: &TimeUnit) -> i64 {
match unit {
Expand Down Expand Up @@ -3471,6 +3493,34 @@ mod tests {
}
}

/// Casting `Utf8` -> `Binary` and `LargeUtf8` -> `LargeBinary` must yield the
/// UTF-8 bytes of every string and keep null slots null.
#[test]
fn test_cast_string_to_binary() {
    // The two non-null inputs; the third slot of each array is null.
    let words = ["Hi", "Hello"];
    let values = vec![Some(words[0]), Some(words[1]), None];

    let utf8: ArrayRef = Arc::new(StringArray::from(values.clone()));
    let large_utf8: ArrayRef = Arc::new(LargeStringArray::from(values));

    // 32-bit offsets: Utf8 -> Binary.
    let casted = cast(&utf8, &DataType::Binary).unwrap();
    let binary = casted.as_any().downcast_ref::<BinaryArray>().unwrap();
    for (idx, word) in words.iter().enumerate() {
        assert_eq!(word.as_bytes(), binary.value(idx));
    }
    assert!(binary.is_null(2));

    // 64-bit offsets: LargeUtf8 -> LargeBinary.
    let casted = cast(&large_utf8, &DataType::LargeBinary).unwrap();
    let large_binary = casted
        .as_any()
        .downcast_ref::<LargeBinaryArray>()
        .unwrap();
    for (idx, word) in words.iter().enumerate() {
        assert_eq!(word.as_bytes(), large_binary.value(idx));
    }
    assert!(large_binary.is_null(2));
}

#[test]
fn test_cast_date32_to_int32() {
let a = Date32Array::from(vec![10000, 17890]);
Expand Down Expand Up @@ -3688,15 +3738,15 @@ mod tests {
#[test]
fn test_cast_from_f64() {
let f64_values: Vec<f64> = vec![
std::i64::MIN as f64,
std::i32::MIN as f64,
std::i16::MIN as f64,
std::i8::MIN as f64,
i64::MIN as f64,
i32::MIN as f64,
i16::MIN as f64,
i8::MIN as f64,
0_f64,
std::u8::MAX as f64,
std::u16::MAX as f64,
std::u32::MAX as f64,
std::u64::MAX as f64,
u8::MAX as f64,
u16::MAX as f64,
u32::MAX as f64,
u64::MAX as f64,
];
let f64_array: ArrayRef = Arc::new(Float64Array::from(f64_values));

Expand Down Expand Up @@ -3838,15 +3888,15 @@ mod tests {
#[test]
fn test_cast_from_f32() {
let f32_values: Vec<f32> = vec![
std::i32::MIN as f32,
std::i32::MIN as f32,
std::i16::MIN as f32,
std::i8::MIN as f32,
i32::MIN as f32,
i32::MIN as f32,
i16::MIN as f32,
i8::MIN as f32,
0_f32,
std::u8::MAX as f32,
std::u16::MAX as f32,
std::u32::MAX as f32,
std::u32::MAX as f32,
u8::MAX as f32,
u16::MAX as f32,
u32::MAX as f32,
u32::MAX as f32,
];
let f32_array: ArrayRef = Arc::new(Float32Array::from(f32_values));

Expand Down Expand Up @@ -3975,10 +4025,10 @@ mod tests {
fn test_cast_from_uint64() {
let u64_values: Vec<u64> = vec![
0,
std::u8::MAX as u64,
std::u16::MAX as u64,
std::u32::MAX as u64,
std::u64::MAX,
u8::MAX as u64,
u16::MAX as u64,
u32::MAX as u64,
u64::MAX,
];
let u64_array: ArrayRef = Arc::new(UInt64Array::from(u64_values));

Expand Down Expand Up @@ -4054,12 +4104,8 @@ mod tests {

#[test]
fn test_cast_from_uint32() {
let u32_values: Vec<u32> = vec![
0,
std::u8::MAX as u32,
std::u16::MAX as u32,
std::u32::MAX as u32,
];
let u32_values: Vec<u32> =
vec![0, u8::MAX as u32, u16::MAX as u32, u32::MAX as u32];
let u32_array: ArrayRef = Arc::new(UInt32Array::from(u32_values));

let f64_expected = vec!["0.0", "255.0", "65535.0", "4294967295.0"];
Expand Down Expand Up @@ -4125,7 +4171,7 @@ mod tests {

#[test]
fn test_cast_from_uint16() {
let u16_values: Vec<u16> = vec![0, std::u8::MAX as u16, std::u16::MAX as u16];
let u16_values: Vec<u16> = vec![0, u8::MAX as u16, u16::MAX as u16];
let u16_array: ArrayRef = Arc::new(UInt16Array::from(u16_values));

let f64_expected = vec!["0.0", "255.0", "65535.0"];
Expand Down Expand Up @@ -4191,7 +4237,7 @@ mod tests {

#[test]
fn test_cast_from_uint8() {
let u8_values: Vec<u8> = vec![0, std::u8::MAX];
let u8_values: Vec<u8> = vec![0, u8::MAX];
let u8_array: ArrayRef = Arc::new(UInt8Array::from(u8_values));

let f64_expected = vec!["0.0", "255.0"];
Expand Down Expand Up @@ -4258,15 +4304,15 @@ mod tests {
#[test]
fn test_cast_from_int64() {
let i64_values: Vec<i64> = vec![
std::i64::MIN,
std::i32::MIN as i64,
std::i16::MIN as i64,
std::i8::MIN as i64,
i64::MIN,
i32::MIN as i64,
i16::MIN as i64,
i8::MIN as i64,
0,
std::i8::MAX as i64,
std::i16::MAX as i64,
std::i32::MAX as i64,
std::i64::MAX,
i8::MAX as i64,
i16::MAX as i64,
i32::MAX as i64,
i64::MAX,
];
let i64_array: ArrayRef = Arc::new(Int64Array::from(i64_values));

Expand Down Expand Up @@ -4413,13 +4459,13 @@ mod tests {
#[test]
fn test_cast_from_int32() {
let i32_values: Vec<i32> = vec![
std::i32::MIN as i32,
std::i16::MIN as i32,
std::i8::MIN as i32,
i32::MIN as i32,
i16::MIN as i32,
i8::MIN as i32,
0,
std::i8::MAX as i32,
std::i16::MAX as i32,
std::i32::MAX as i32,
i8::MAX as i32,
i16::MAX as i32,
i32::MAX as i32,
];
let i32_array: ArrayRef = Arc::new(Int32Array::from(i32_values));

Expand Down Expand Up @@ -4507,13 +4553,8 @@ mod tests {

#[test]
fn test_cast_from_int16() {
let i16_values: Vec<i16> = vec![
std::i16::MIN,
std::i8::MIN as i16,
0,
std::i8::MAX as i16,
std::i16::MAX,
];
let i16_values: Vec<i16> =
vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, i16::MAX];
let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values));

let f64_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"];
Expand Down Expand Up @@ -4580,13 +4621,13 @@ mod tests {
#[test]
fn test_cast_from_date32() {
let i32_values: Vec<i32> = vec![
std::i32::MIN as i32,
std::i16::MIN as i32,
std::i8::MIN as i32,
i32::MIN as i32,
i16::MIN as i32,
i8::MIN as i32,
0,
std::i8::MAX as i32,
std::i16::MAX as i32,
std::i32::MAX as i32,
i8::MAX as i32,
i16::MAX as i32,
i32::MAX as i32,
];
let date32_array: ArrayRef = Arc::new(Date32Array::from(i32_values));

Expand All @@ -4607,7 +4648,7 @@ mod tests {

#[test]
fn test_cast_from_int8() {
let i8_values: Vec<i8> = vec![std::i8::MIN, 0, std::i8::MAX];
let i8_values: Vec<i8> = vec![i8::MIN, 0, i8::MAX];
let i8_array: ArrayRef = Arc::new(Int8Array::from(i8_values));

let f64_expected = vec!["-128.0", "0.0", "127.0"];
Expand Down
2 changes: 1 addition & 1 deletion arrow/src/datatypes/datatype.rs
Expand Up @@ -1391,7 +1391,7 @@ impl DataType {
}
}

/// Returns true if this type is numeric: (UInt*, Unit*, or Float*).
/// Returns true if this type is numeric: (UInt*, Int*, or Float*).
pub fn is_numeric(t: &DataType) -> bool {
use DataType::*;
matches!(
Expand Down