Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use take for dictionary like comparisons #3313

Merged
merged 4 commits into from Dec 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions arrow-string/Cargo.toml
Expand Up @@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" }
arrow-data = { version = "29.0.0", path = "../arrow-data" }
arrow-schema = { version = "29.0.0", path = "../arrow-schema" }
arrow-array = { version = "29.0.0", path = "../arrow-array" }
arrow-select = { version = "29.0.0", path = "../arrow-select" }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }

Expand Down
121 changes: 17 additions & 104 deletions arrow-string/src/like.rs
Expand Up @@ -21,6 +21,7 @@ use arrow_array::*;
use arrow_data::bit_mask::combine_option_bitmap;
use arrow_data::ArrayData;
use arrow_schema::*;
use arrow_select::take::take;
use regex::Regex;
use std::collections::HashMap;

Expand Down Expand Up @@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
like_dict_scalar(left, right)
let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The duplication is slightly unfortunate, but will hopefully get cleaned up as part of #3296

// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -240,31 +244,6 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
like_scalar(left, right)
}

/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`] and a scalar.
///
/// Dispatches on the dictionary's value type, downcasts to the matching typed
/// dictionary view and forwards to `like_scalar`.
///
/// See the documentation on [`like_utf8`] for more details.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when the dictionary values are neither
/// `Utf8` nor `LargeUtf8`.
fn like_dict_scalar<K: ArrowPrimitiveType>(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

left: &DictionaryArray<K>,
right: &str,
) -> Result<BooleanArray, ArrowError> {
match left.value_type() {
DataType::Utf8 => {
let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
like_scalar(left, right)
}
DataType::LargeUtf8 => {
let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
like_scalar(left, right)
}
_ => {
Err(ArrowError::ComputeError(
"like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
))
}
}
}

/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
///
/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
Expand Down Expand Up @@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
nlike_dict_scalar(left, right)
let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nlike_scalar(left, right)
}

/// Evaluate SQL `left NOT LIKE right` for a [`DictionaryArray`] whose values are
/// [`StringArray`]/[`LargeStringArray`], comparing against a scalar pattern.
///
/// The dictionary's value type selects which typed view is handed to
/// `nlike_scalar`; any other value type is rejected with a
/// [`ArrowError::ComputeError`].
///
/// See the documentation on [`like_utf8`] for more details.
fn nlike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let value_type = left.value_type();

    // 32-bit offset string values.
    if matches!(value_type, DataType::Utf8) {
        let utf8_dict = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return nlike_scalar(utf8_dict, right);
    }

    // 64-bit offset string values.
    if matches!(value_type, DataType::LargeUtf8) {
        let large_utf8_dict = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return nlike_scalar(large_utf8_dict, right);
    }

    Err(ArrowError::ComputeError(
        "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
Expand Down Expand Up @@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
ilike_dict_scalar(left, right)
let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
ilike_scalar(left, right)
}

/// Evaluate SQL `left ILIKE right` for a [`DictionaryArray`] whose values are
/// [`StringArray`]/[`LargeStringArray`], comparing against a scalar pattern.
///
/// The dictionary's value type selects which typed view is handed to
/// `ilike_scalar`; any other value type is rejected with a
/// [`ArrowError::ComputeError`].
///
/// See the documentation on [`like_utf8`] for more details.
fn ilike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let value_type = left.value_type();

    // 32-bit offset string values.
    if matches!(value_type, DataType::Utf8) {
        let utf8_dict = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return ilike_scalar(utf8_dict, right);
    }

    // 64-bit offset string values.
    if matches!(value_type, DataType::LargeUtf8) {
        let large_utf8_dict = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return ilike_scalar(large_utf8_dict, right);
    }

    Err(ArrowError::ComputeError(
        "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
Expand Down Expand Up @@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
nilike_dict_scalar(left, right)
let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nilike_scalar(left, right)
}

/// Evaluate SQL `left NOT ILIKE right` for a [`DictionaryArray`] whose values are
/// [`StringArray`]/[`LargeStringArray`], comparing against a scalar pattern.
///
/// The dictionary's value type selects which typed view is handed to
/// `nilike_scalar`; any other value type is rejected with a
/// [`ArrowError::ComputeError`].
///
/// See the documentation on [`like_utf8`] for more details.
fn nilike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let value_type = left.value_type();

    // 32-bit offset string values.
    if matches!(value_type, DataType::Utf8) {
        let utf8_dict = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return nilike_scalar(utf8_dict, right);
    }

    // 64-bit offset string values.
    if matches!(value_type, DataType::LargeUtf8) {
        let large_utf8_dict = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return nilike_scalar(large_utf8_dict, right);
    }

    Err(ArrowError::ComputeError(
        "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Returns `true` when `c` is one of the SQL `LIKE` wildcard characters:
/// `%` or `_`.
fn is_like_pattern(c: char) -> bool {
    matches!(c, '%' | '_')
}
Expand Down
24 changes: 21 additions & 3 deletions arrow/benches/comparison_kernels.rs
Expand Up @@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});

let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let strings = create_string_array::<i32>(20, 0.);
let dict_arr_a = create_dict_from_values::<Int32Type>(size, 0., &strings);
let dict_arr_b = create_dict_from_values::<Int32Type>(size, 0., &strings);

c.bench_function("dict eq string", |b| {
c.bench_function("eq dictionary[10] string[4])", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
});

c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| {
b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test"))
});

c.bench_function(
"gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])",
|b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")),
);

c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| {
b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test"))
});

c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| {
b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test"))
});
}

criterion_group!(benches, add_benchmark);
Expand Down