Skip to content

Commit

Permalink
Support comparison between dictionary array and binary array (#2645)
Browse files Browse the repository at this point in the history
* Support comparison between dictionary array and binary array

* Use downcast_dictionary_array
  • Loading branch information
viirya committed Sep 6, 2022
1 parent 0c85233 commit 4e65952
Showing 1 changed file with 165 additions and 45 deletions.
210 changes: 165 additions & 45 deletions arrow/src/compute/kernels/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ use crate::datatypes::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
#[allow(unused_imports)]
use crate::downcast_dictionary_array;
use crate::error::{ArrowError, Result};
use crate::util::bit_util;
use regex::Regex;
Expand Down Expand Up @@ -2172,58 +2174,23 @@ macro_rules! typed_dict_string_array_cmp {
}};
}

/// Compares a dictionary array with boolean values against `$RIGHT` using `$OP`.
///
/// `$LEFT_KEY_TYPE` is matched against the integer key types; the matching arm
/// downcasts `$LEFT` to a dictionary array with that key type and forwards it
/// to `cmp_dict_boolean_array`. Any other key type produces
/// `ArrowError::NotYetImplemented`.
#[cfg(feature = "dyn_cmp_dict")]
macro_rules! typed_dict_boolean_array_cmp {
    ($LEFT: expr, $RIGHT: expr, $LEFT_KEY_TYPE: expr, $OP: expr) => {{
        match $LEFT_KEY_TYPE {
            // Signed key types.
            DataType::Int8 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<Int8Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::Int16 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<Int16Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::Int32 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<Int32Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::Int64 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<Int64Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            // Unsigned key types.
            DataType::UInt8 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<UInt8Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::UInt16 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<UInt16Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::UInt32 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<UInt32Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            DataType::UInt64 => cmp_dict_boolean_array::<_, _>(
                as_dictionary_array::<UInt64Type>($LEFT),
                $RIGHT,
                $OP,
            ),
            // Anything else is not a supported dictionary key type here.
            other => Err(ArrowError::NotYetImplemented(format!(
                "Cannot compare dictionary array of key type {}",
                other
            ))),
        }
    }};
}

#[cfg(feature = "dyn_cmp_dict")]
macro_rules! typed_cmp_dict_non_dict {
($LEFT: expr, $RIGHT: expr, $OP_BOOL: expr, $OP: expr, $OP_FLOAT: expr) => {{
match ($LEFT.data_type(), $RIGHT.data_type()) {
(DataType::Dictionary(left_key_type, left_value_type), right_type) => {
match (left_value_type.as_ref(), right_type) {
(DataType::Boolean, DataType::Boolean) => {
typed_dict_boolean_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), $OP_BOOL)
let left = $LEFT;
downcast_dictionary_array!(
left => {
cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP)
}
_ => Err(ArrowError::NotYetImplemented(format!(
"Cannot compare dictionary array of key type {}",
left_key_type.as_ref()
))),
)
}
(DataType::Int8, DataType::Int8) => {
typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int8Type, $OP_BOOL, $OP)
Expand Down Expand Up @@ -2261,6 +2228,30 @@ macro_rules! typed_cmp_dict_non_dict {
(DataType::LargeUtf8, DataType::LargeUtf8) => {
typed_dict_string_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), i64, $OP)
}
(DataType::Binary, DataType::Binary) => {
let left = $LEFT;
downcast_dictionary_array!(
left => {
cmp_dict_binary_array::<_, i32, _>(left, $RIGHT, $OP)
}
_ => Err(ArrowError::NotYetImplemented(format!(
"Cannot compare dictionary array of key type {}",
left_key_type.as_ref()
))),
)
}
(DataType::LargeBinary, DataType::LargeBinary) => {
let left = $LEFT;
downcast_dictionary_array!(
left => {
cmp_dict_binary_array::<_, i64, _>(left, $RIGHT, $OP)
}
_ => Err(ArrowError::NotYetImplemented(format!(
"Cannot compare dictionary array of key type {}",
left_key_type.as_ref()
))),
)
}
(t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!(
"Comparing dictionary array of type {} with array of type {} is not yet implemented",
t1, t2
Expand Down Expand Up @@ -2672,6 +2663,29 @@ where
)
}

/// Perform given operation on `DictionaryArray` and `GenericBinaryArray`. The value
/// type of `DictionaryArray` is same as `GenericBinaryArray`'s type.
///
/// `OffsetSize` selects the binary flavor on both sides; the visible call sites
/// use `i32` for `Binary` and `i64` for `LargeBinary`. Both inputs are downcast
/// to `GenericBinaryArray<OffsetSize>` and handed to `compare_op` together with
/// `op`, which receives the two byte slices being compared.
///
/// # Errors
///
/// Returns `ArrowError::CastError` when the dictionary values of `left` or the
/// array `right` are not a `GenericBinaryArray<OffsetSize>`, instead of
/// panicking. Any error produced by `compare_op` is returned unchanged.
#[cfg(feature = "dyn_cmp_dict")]
fn cmp_dict_binary_array<K, OffsetSize: OffsetSizeTrait, F>(
    left: &DictionaryArray<K>,
    right: &dyn Array,
    op: F,
) -> Result<BooleanArray>
where
    K: ArrowNumericType,
    F: Fn(&[u8], &[u8]) -> bool,
{
    // View the dictionary through its values' concrete type.
    let left_values = left
        .downcast_dict::<GenericBinaryArray<OffsetSize>>()
        .ok_or_else(|| {
            ArrowError::CastError(
                "Cannot downcast the values of the left dictionary array to a binary array"
                    .to_string(),
            )
        })?;

    // The right-hand side must be a plain binary array with the same offset size.
    let right_values = right
        .as_any()
        .downcast_ref::<GenericBinaryArray<OffsetSize>>()
        .ok_or_else(|| {
            ArrowError::CastError(format!(
                "Cannot downcast the right array of type {} to a binary array",
                right.data_type()
            ))
        })?;

    compare_op(left_values, right_values, op)
}

/// Perform given operation on two `DictionaryArray`s which value type is
/// primitive type. Returns an error if the two arrays have different value
/// type
Expand Down Expand Up @@ -6149,6 +6163,112 @@ mod tests {
);
}

/// Checks `eq_dyn` / `neq_dyn` between a binary-valued dictionary array and a
/// plain binary array, with the operands in both orders.
#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dictionary_to_binary_array() {
    // Dictionary side: keys [0, null, 2, 2] over the three values below.
    let dict_values: BinaryArray = ["hello", "", "parquet"]
        .into_iter()
        .map(|text| Some(text.as_bytes()))
        .collect();
    let dict_keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]);
    let dict_array =
        DictionaryArray::<UInt64Type>::try_new(&dict_keys, &dict_values).unwrap();

    // Plain binary side, four rows.
    let array: BinaryArray = ["hello", "", "parquet", "test"]
        .into_iter()
        .map(|text| Some(text.as_bytes()))
        .collect();

    // Rows 0 and 2 always share one outcome, row 1 is null, row 3 differs.
    let expected = |rows_0_and_2: bool, row_3: bool| {
        BooleanArray::from(vec![
            Some(rows_0_and_2),
            None,
            Some(rows_0_and_2),
            Some(row_3),
        ])
    };

    assert_eq!(eq_dyn(&dict_array, &array).unwrap(), expected(true, false));
    assert_eq!(eq_dyn(&array, &dict_array).unwrap(), expected(true, false));

    assert_eq!(neq_dyn(&dict_array, &array).unwrap(), expected(false, true));
    assert_eq!(neq_dyn(&array, &dict_array).unwrap(), expected(false, true));
}

/// Checks the ordering kernels (`lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn`)
/// between a binary-valued dictionary array and a plain binary array, with the
/// operands in both orders.
#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_binary_array() {
    // Dictionary side: keys [0, null, 2, 2] over the three values below.
    let dict_values: BinaryArray = ["hello", "", "parquet"]
        .into_iter()
        .map(|text| Some(text.as_bytes()))
        .collect();
    let dict_keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]);
    let dict_array =
        DictionaryArray::<UInt64Type>::try_new(&dict_keys, &dict_values).unwrap();

    // Plain binary side, four rows.
    let array: BinaryArray = ["hello", "", "parquet", "test"]
        .into_iter()
        .map(|text| Some(text.as_bytes()))
        .collect();

    // Rows 0 and 2 always share one outcome, row 1 is null, row 3 differs.
    let expected = |rows_0_and_2: bool, row_3: bool| {
        BooleanArray::from(vec![
            Some(rows_0_and_2),
            None,
            Some(rows_0_and_2),
            Some(row_3),
        ])
    };

    // Strictly less than.
    assert_eq!(lt_dyn(&dict_array, &array).unwrap(), expected(false, true));
    assert_eq!(lt_dyn(&array, &dict_array).unwrap(), expected(false, false));

    // Less than or equal.
    assert_eq!(lt_eq_dyn(&dict_array, &array).unwrap(), expected(true, true));
    assert_eq!(lt_eq_dyn(&array, &dict_array).unwrap(), expected(true, false));

    // Strictly greater than.
    assert_eq!(gt_dyn(&dict_array, &array).unwrap(), expected(false, false));
    assert_eq!(gt_dyn(&array, &dict_array).unwrap(), expected(false, true));

    // Greater than or equal.
    assert_eq!(gt_eq_dyn(&dict_array, &array).unwrap(), expected(true, false));
    assert_eq!(gt_eq_dyn(&array, &dict_array).unwrap(), expected(true, true));
}

#[test]
fn test_dict_nlike_kernels() {
let data =
Expand Down

0 comments on commit 4e65952

Please sign in to comment.