From 4c1cffb93d80287dcba11a2ebbe33f9ee7b064c9 Mon Sep 17 00:00:00 2001 From: psvri Date: Mon, 15 Aug 2022 23:53:18 +0530 Subject: [PATCH 1/4] Implement utf8 to binary in compute::kernels::cast --- arrow/src/compute/kernels/cast.rs | 148 +++++++++++++++++++----------- arrow/src/datatypes/datatype.rs | 2 +- 2 files changed, 97 insertions(+), 53 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index ddca0c2e935..1f8c563a065 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -146,7 +146,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, LargeUtf8) => true, (LargeUtf8, Utf8) => true, (Utf8, - Date32 + Binary + | Date32 | Date64 | Time32(TimeUnit::Second) | Time32(TimeUnit::Millisecond) @@ -156,7 +157,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { ) => true, (Utf8, _) => DataType::is_numeric(to_type), (LargeUtf8, - Date32 + LargeBinary + | Date32 | Date64 | Time32(TimeUnit::Second) | Time32(TimeUnit::Millisecond) @@ -693,6 +695,7 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), + Binary => cast_string_to_binary::(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -839,6 +842,7 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), + LargeBinary => cast_string_to_binary::(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -1254,6 +1258,21 @@ pub fn cast_with_options( } } +/// Cast to string array to binary array +fn cast_string_to_binary(array: &ArrayRef) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let array = array.as_any().downcast_ref::>().unwrap(); + + Ok(Arc::new(array.iter().map(|x| { + match x { + Some(data) => Some(data.as_bytes()), + None => None + } + }).collect::>())) +} + /// Get the time unit as a multiple of a second const fn time_unit_multiple(unit: &TimeUnit) -> i64 { match unit { @@ -3471,6 +3490,31 @@ mod tests { } } + #[test] + fn test_cast_string_to_binary() { + let string_1 = "Hi"; + let string_2 = "Hello"; + + let bytes_1 = string_1.as_bytes(); + let bytes_2 = string_2.as_bytes(); + + let string_data = vec![Some(string_1), Some(string_2), None]; + let a1 = Arc::new(StringArray::from(string_data.clone())) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(string_data)) as ArrayRef; + + let mut array_ref = cast(&a1, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_any().downcast_ref::().unwrap(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); + + array_ref = cast(&a2, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_any().downcast_ref::().unwrap(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + #[test] fn test_cast_date32_to_int32() { let a = Date32Array::from(vec![10000, 17890]); @@ -3688,15 +3732,15 @@ mod tests { #[test] fn test_cast_from_f64() { let f64_values: Vec = vec![ - std::i64::MIN as f64, - std::i32::MIN as f64, - std::i16::MIN as f64, - std::i8::MIN as f64, + i64::MIN as f64, + i32::MIN as f64, + i16::MIN as f64, + i8::MIN as f64, 0_f64, - std::u8::MAX as f64, - std::u16::MAX as f64, - std::u32::MAX as f64, - std::u64::MAX as f64, + u8::MAX as f64, + u16::MAX as f64, + u32::MAX as f64, + u64::MAX as f64, ]; let f64_array: ArrayRef = Arc::new(Float64Array::from(f64_values)); @@ -3838,15 +3882,15 @@ mod tests { #[test] fn test_cast_from_f32() { let f32_values: Vec = vec![ - std::i32::MIN as f32, - std::i32::MIN as f32, - std::i16::MIN as f32, - std::i8::MIN as f32, + i32::MIN as f32, + i32::MIN as f32, + i16::MIN as f32, + i8::MIN as f32, 0_f32, - std::u8::MAX as f32, - std::u16::MAX as f32, - std::u32::MAX as f32, - std::u32::MAX as f32, + u8::MAX as f32, + u16::MAX as f32, + u32::MAX as f32, + u32::MAX as f32, ]; let f32_array: ArrayRef = Arc::new(Float32Array::from(f32_values)); @@ -3975,10 +4019,10 @@ mod tests { fn test_cast_from_uint64() { let u64_values: Vec = vec![ 0, - std::u8::MAX as u64, - std::u16::MAX as u64, - std::u32::MAX as u64, - std::u64::MAX, + u8::MAX as u64, + u16::MAX as u64, + u32::MAX as u64, + u64::MAX, ]; let u64_array: ArrayRef = Arc::new(UInt64Array::from(u64_values)); @@ -4056,9 +4100,9 @@ mod tests { fn test_cast_from_uint32() { let u32_values: Vec = vec![ 0, - std::u8::MAX as u32, - std::u16::MAX as u32, - std::u32::MAX as u32, + u8::MAX as u32, + u16::MAX as u32, + u32::MAX as u32, ]; let u32_array: ArrayRef = Arc::new(UInt32Array::from(u32_values)); @@ -4125,7 +4169,7 @@ mod tests { #[test] fn test_cast_from_uint16() { - let u16_values: Vec = vec![0, std::u8::MAX as u16, std::u16::MAX as u16]; + let u16_values: Vec = vec![0, u8::MAX as u16, u16::MAX as u16]; let u16_array: ArrayRef = Arc::new(UInt16Array::from(u16_values)); let f64_expected = vec!["0.0", "255.0", "65535.0"]; @@ -4191,7 +4235,7 @@ mod tests { #[test] fn test_cast_from_uint8() { - let u8_values: Vec = vec![0, std::u8::MAX]; + let u8_values: Vec = vec![0, u8::MAX]; let u8_array: ArrayRef = Arc::new(UInt8Array::from(u8_values)); let f64_expected = vec!["0.0", "255.0"]; @@ -4258,15 +4302,15 @@ mod tests { #[test] fn test_cast_from_int64() { let i64_values: Vec = vec![ - std::i64::MIN, - std::i32::MIN as i64, - std::i16::MIN as i64, - std::i8::MIN as i64, + i64::MIN, + i32::MIN as i64, + i16::MIN as i64, + i8::MIN as i64, 0, - std::i8::MAX as i64, - std::i16::MAX as i64, - std::i32::MAX as i64, - std::i64::MAX, + i8::MAX as i64, + i16::MAX as i64, + i32::MAX as i64, + i64::MAX, ]; let i64_array: ArrayRef = Arc::new(Int64Array::from(i64_values)); @@ -4413,13 +4457,13 @@ mod tests { #[test] fn test_cast_from_int32() { let i32_values: Vec = vec![ - std::i32::MIN as i32, - std::i16::MIN as i32, - std::i8::MIN as i32, + i32::MIN as i32, + i16::MIN as i32, + i8::MIN as i32, 0, - std::i8::MAX as i32, - std::i16::MAX as i32, - std::i32::MAX as i32, + i8::MAX as i32, + i16::MAX as i32, + i32::MAX as i32, ]; let i32_array: ArrayRef = Arc::new(Int32Array::from(i32_values)); @@ -4508,11 +4552,11 @@ mod tests { #[test] fn test_cast_from_int16() { let i16_values: Vec = vec![ - std::i16::MIN, - std::i8::MIN as i16, + i16::MIN, + i8::MIN as i16, 0, - std::i8::MAX as i16, - std::i16::MAX, + i8::MAX as i16, + i16::MAX, ]; let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values)); @@ -4580,13 +4624,13 @@ mod tests { #[test] fn test_cast_from_date32() { let i32_values: Vec = vec![ - std::i32::MIN as i32, - std::i16::MIN as i32, - std::i8::MIN as i32, + i32::MIN as i32, + i16::MIN as i32, + i8::MIN as i32, 0, - std::i8::MAX as i32, - std::i16::MAX as i32, - std::i32::MAX as i32, + i8::MAX as i32, + i16::MAX as i32, + i32::MAX as i32, ]; let date32_array: ArrayRef = Arc::new(Date32Array::from(i32_values)); @@ -4607,7 +4651,7 @@ mod tests { #[test] fn test_cast_from_int8() { - let i8_values: Vec = vec![std::i8::MIN, 0, std::i8::MAX]; + let i8_values: Vec = vec![i8::MIN, 0, i8::MAX]; let i8_array: ArrayRef = Arc::new(Int8Array::from(i8_values)); let f64_expected = vec!["-128.0", "0.0", "127.0"]; diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 97ddc0c4a61..4f65afd2d20 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -1391,7 +1391,7 @@ impl DataType { } } - /// Returns true if this type is numeric: (UInt*, Unit*, or Float*). + /// Returns true if this type is numeric: (UInt*, Int*, or Float*). pub fn is_numeric(t: &DataType) -> bool { use DataType::*; matches!( From 2826e8dd414f024c981834354c55aba219d99247 Mon Sep 17 00:00:00 2001 From: psvri Date: Tue, 16 Aug 2022 00:03:40 +0530 Subject: [PATCH 2/4] Fix clippy lints --- arrow/src/compute/kernels/cast.rs | 39 ++++++++++++++----------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 1f8c563a065..bfafe65fb5d 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -1263,14 +1263,17 @@ fn cast_string_to_binary(array: &ArrayRef) -> Result where OffsetSize: OffsetSizeTrait, { - let array = array.as_any().downcast_ref::>().unwrap(); + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); - Ok(Arc::new(array.iter().map(|x| { - match x { - Some(data) => Some(data.as_bytes()), - None => None - } - }).collect::>())) + Ok(Arc::new( + array + .iter() + .map(|x| x.map(|data| data.as_bytes())) + .collect::>(), + )) } /// Get the time unit as a multiple of a second @@ -3509,7 +3512,10 @@ mod tests { assert!(down_cast.is_null(2)); array_ref = cast(&a2, &DataType::LargeBinary).unwrap(); - let down_cast = array_ref.as_any().downcast_ref::().unwrap(); + let down_cast = array_ref + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(bytes_1, down_cast.value(0)); assert_eq!(bytes_2, down_cast.value(1)); assert!(down_cast.is_null(2)); @@ -4098,12 +4104,8 @@ mod tests { #[test] fn test_cast_from_uint32() { - let u32_values: Vec = vec![ - 0, - u8::MAX as u32, - u16::MAX as u32, - u32::MAX as u32, - ]; + let u32_values: Vec = + vec![0, u8::MAX as u32, u16::MAX as u32, u32::MAX as u32]; let u32_array: ArrayRef = Arc::new(UInt32Array::from(u32_values)); let f64_expected = vec!["0.0", "255.0", "65535.0", "4294967295.0"]; @@ -4551,13 +4553,8 @@ mod tests { #[test] fn test_cast_from_int16() { - let i16_values: Vec = vec![ - i16::MIN, - i8::MIN as i16, - 0, - i8::MAX as i16, - i16::MAX, - ]; + let i16_values: Vec = + vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, i16::MAX]; let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values)); let f64_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"]; From f38e4b607dbb1a90a4529d6277d3c38dc1631f61 Mon Sep 17 00:00:00 2001 From: psvri Date: Tue, 16 Aug 2022 19:15:10 +0530 Subject: [PATCH 3/4] Improve performance --- arrow/src/compute/kernels/cast.rs | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index bfafe65fb5d..586a6dfac95 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -695,7 +695,7 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), - Binary => cast_string_to_binary::(array), + Binary => cast_string_to_binary(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -842,7 +842,7 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), - LargeBinary => cast_string_to_binary::(array), + LargeBinary => cast_string_to_binary(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -1259,21 +1259,22 @@ pub fn cast_with_options( } /// Cast to string array to binary array -fn cast_string_to_binary(array: &ArrayRef) -> Result -where - OffsetSize: OffsetSizeTrait, +fn cast_string_to_binary(array: &ArrayRef) -> Result { - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); + let from_type = array.data_type(); + match *from_type { + DataType::Utf8 => { + let data = unsafe {array.data().clone().into_builder().data_type(DataType::Binary).build_unchecked()}; - Ok(Arc::new( - array - .iter() - .map(|x| x.map(|data| data.as_bytes())) - .collect::>(), - )) + Ok(Arc::new(BinaryArray::from(data)) as ArrayRef) + } + DataType::LargeUtf8 => { + let data = unsafe {array.data().clone().into_builder().data_type(DataType::LargeBinary).build_unchecked()}; + + Ok(Arc::new(LargeBinaryArray::from(data)) as ArrayRef) + } + _ => Err(ArrowError::InvalidArgumentError(format!("{:?} cannot be converted to binary array", from_type))), + } } /// Get the time unit as a multiple of a second From 79fbec38d2f0bc0ff60ef1607b500eeff79b9ed1 Mon Sep 17 00:00:00 2001 From: psvri Date: Tue, 16 Aug 2022 20:02:15 +0530 Subject: [PATCH 4/4] Fix formatting errors --- arrow/src/compute/kernels/cast.rs | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 586a6dfac95..ebbf4a65b19 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -1259,21 +1259,37 @@ pub fn cast_with_options( } /// Cast to string array to binary array -fn cast_string_to_binary(array: &ArrayRef) -> Result -{ +fn cast_string_to_binary(array: &ArrayRef) -> Result { let from_type = array.data_type(); match *from_type { DataType::Utf8 => { - let data = unsafe {array.data().clone().into_builder().data_type(DataType::Binary).build_unchecked()}; + let data = unsafe { + array + .data() + .clone() + .into_builder() + .data_type(DataType::Binary) + .build_unchecked() + }; - Ok(Arc::new(BinaryArray::from(data)) as ArrayRef) + Ok(Arc::new(BinaryArray::from(data)) as ArrayRef) } DataType::LargeUtf8 => { - let data = unsafe {array.data().clone().into_builder().data_type(DataType::LargeBinary).build_unchecked()}; + let data = unsafe { + array + .data() + .clone() + .into_builder() + .data_type(DataType::LargeBinary) + .build_unchecked() + }; - Ok(Arc::new(LargeBinaryArray::from(data)) as ArrayRef) + Ok(Arc::new(LargeBinaryArray::from(data)) as ArrayRef) } - _ => Err(ArrowError::InvalidArgumentError(format!("{:?} cannot be converted to binary array", from_type))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "{:?} cannot be converted to binary array", + from_type + ))), } }