Skip to content

Commit

Permalink
Add nilike support in comparison (#1846)
Browse files Browse the repository at this point in the history
  • Loading branch information
MazterQyou committed Jun 15, 2022
1 parent 328c680 commit 9860aa7
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 0 deletions.
25 changes: 25 additions & 0 deletions arrow/benches/comparison_kernels.rs
Expand Up @@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
.unwrap();
}

/// Benchmark body for [`nilike_utf8_scalar`].
///
/// Both inputs are routed through `criterion::black_box` before the kernel is
/// called, and the call panics (via `unwrap`) if the kernel returns an error.
/// The resulting array is discarded.
fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
    let haystack = criterion::black_box(arr_a);
    let pattern = criterion::black_box(value_b);
    nilike_utf8_scalar(haystack, pattern).unwrap();
}

fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
regexp_is_match_utf8_scalar(
criterion::black_box(arr_a),
Expand Down Expand Up @@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
});

c.bench_function("nilike_utf8 scalar equals", |b| {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xxXX"))
});

c.bench_function("nilike_utf8 scalar contains", |b| {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xxXX%"))
});

c.bench_function("nilike_utf8 scalar ends with", |b| {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%"))
});

c.bench_function("nilike_utf8 scalar starts with", |b| {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx"))
});

c.bench_function("nilike_utf8 scalar complex", |b| {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
});

c.bench_function("egexp_matches_utf8 scalar starts with", |b| {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
});
Expand Down
137 changes: 137 additions & 0 deletions arrow/src/compute/kernels/comparison.rs
Expand Up @@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
Ok(BooleanArray::from(data))
}

/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
/// Each pattern handed back by `regex_like` is compiled as a case-insensitive
/// (`(?i)`) regex anchored at both ends (`^…$`).
///
/// # Errors
///
/// Returns `ArrowError::ComputeError` when a pattern cannot be compiled into
/// a regex.
///
/// See the documentation on [`like_utf8`] for more details.
pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
    left: &GenericStringArray<OffsetSize>,
    right: &GenericStringArray<OffsetSize>,
) -> Result<BooleanArray> {
    // NOTE(review): the `true` argument presumably asks `regex_like` to negate
    // each match (NOT ILIKE) -- confirm against `regex_like`'s signature.
    regex_like(left, right, true, |re_pattern| {
        let anchored = format!("(?i)^{}$", re_pattern);
        match Regex::new(&anchored) {
            Ok(compiled) => Ok(compiled),
            Err(e) => {
                let message =
                    format!("Unable to build regex from ILIKE pattern: {}", e);
                Err(ArrowError::ComputeError(message))
            }
        }
    })
}

/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`] and a scalar.
///
/// `right` is a SQL LIKE pattern in which `%` matches any run of characters
/// and `_` matches exactly one character. Matching is case-insensitive on
/// every path, and each output value is the negation of the match.
///
/// The null buffer of `left` is cloned and reused for the result. A value is
/// still computed for every slot, including those marked null.
///
/// # Errors
///
/// Returns `ArrowError::ComputeError` if the general (regex) path is taken and
/// the pattern cannot be compiled into a regex.
///
/// See the documentation on [`like_utf8`] for more details.
pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
    left: &GenericStringArray<OffsetSize>,
    right: &str,
) -> Result<BooleanArray> {
    // NOTE(review): the null buffer is reused as-is while the result is built
    // with offset 0 -- confirm this is correct when `left` is a sliced array.
    let null_bit_buffer = left.data().null_buffer().cloned();
    let mut result = BooleanBufferBuilder::new(left.len());

    if !right.contains(is_like_pattern) {
        // fast path: no wildcard in the pattern, so NOT ILIKE is a
        // case-insensitive inequality. Both sides are upper-cased, matching
        // the prefix/suffix fast paths below (a plain `!=` would wrongly
        // report "ARROW" NOT ILIKE "arrow" as true).
        let right_upper = right.to_uppercase();
        for i in 0..left.len() {
            result.append(left.value(i).to_uppercase() != right_upper);
        }
    } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
    {
        // fast path: `prefix%` with no other wildcard, can use starts_with.
        // The upper-cased prefix is loop-invariant, so compute it once.
        let prefix_upper = right[..right.len() - 1].to_uppercase();
        for i in 0..left.len() {
            result.append(
                !left
                    .value(i)
                    .to_uppercase()
                    .starts_with(prefix_upper.as_str()),
            );
        }
    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
        // fast path: `%suffix` with no other wildcard, can use ends_with.
        let suffix_upper = right[1..].to_uppercase();
        for i in 0..left.len() {
            result.append(
                !left
                    .value(i)
                    .to_uppercase()
                    .ends_with(suffix_upper.as_str()),
            );
        }
    } else {
        // general path: escape regex metacharacters first, then translate the
        // SQL wildcards (`%` -> `.*`, `_` -> `.`) and match the whole string
        // case-insensitively.
        let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
            ArrowError::ComputeError(format!(
                "Unable to build regex from ILIKE pattern: {}",
                e
            ))
        })?;
        for i in 0..left.len() {
            let haystack = left.value(i);
            result.append(!re.is_match(haystack));
        }
    }

    // SAFETY: every branch above appends exactly one bit per element of
    // `left`, so the value buffer holds `left.len()` entries, and the null
    // buffer comes from `left` itself.
    let data = unsafe {
        ArrayData::new_unchecked(
            DataType::Boolean,
            left.len(),
            None,
            null_bit_buffer,
            0,
            vec![result.finish()],
            vec![],
        )
    };
    Ok(BooleanArray::from(data))
}

/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
/// If `regex_array` element has an empty value, the corresponding result value is always true.
///
Expand Down Expand Up @@ -3984,6 +4067,60 @@ mod tests {
vec![false, true, false, false]
);

// Array-vs-array NOT ILIKE with mixed-case inputs.
// (Assumes `test_utf8!` pairs element i of the first vector with pattern i of
// the second -- the macro is defined elsewhere; the expected values are
// consistent with that reading.)
// Patterns cover: a literal, `%` at the end, `%` on both sides, a literal that
// does not match, `%` in the middle, and `_` (which needs exactly one extra
// character, so "ARROWS" matches `arrow_` but "arROw" does not).
// Expected values are the negation of the case-insensitive match.
test_utf8!(
test_utf8_array_nilike,
vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
nilike_utf8,
vec![false, false, false, true, true, false, true]
);
// Parentheses in the pattern are regex metacharacters and must be matched
// literally: only the values containing `(` ... `)` match `%(%)%`, so they
// yield `false` under NOT ILIKE, while the values without parentheses yield
// `true`.
test_utf8_scalar!(
nilike_utf8_scalar_escape_testing,
vec!["varchar(255)", "int(255)", "varchar", "int"],
"%(%)%",
nilike_utf8_scalar,
vec![false, false, true, true]
);
// Upper-case "contains" pattern against lower-case values: "arrow" and
// "parquet" contain "ar" (ignoring case) and so yield `false`; the other two
// values do not and yield `true`.
test_utf8_scalar!(
test_utf8_array_nilike_scalar,
vec!["arrow", "parquet", "datafusion", "flight"],
"%AR%",
nilike_utf8_scalar,
vec![false, false, true, true]
);

// Prefix pattern (`aRRow%`, single trailing `%`) with mixed case: values that
// begin with "arrow" ignoring case yield `false`. "parrow" has a leading extra
// character and "ARR" is shorter than the prefix, so both yield `true`.
test_utf8_scalar!(
test_utf8_array_nilike_scalar_start,
vec!["arrow", "parrow", "arrows", "ARR"],
"aRRow%",
nilike_utf8_scalar,
vec![false, true, false, true]
);

// Suffix pattern (`%arrow`, single leading `%`) with mixed-case values: values
// that end with "arrow" ignoring case yield `false`. "ARRowS" has a trailing
// extra character and "arr" is too short, so both yield `true`.
test_utf8_scalar!(
test_utf8_array_nilike_scalar_end,
vec!["ArroW", "parrow", "ARRowS", "arr"],
"%arrow",
nilike_utf8_scalar,
vec![false, false, true, true]
);

// Pattern without any wildcard: only the value equal to "arrow" yields
// `false`.
// NOTE(review): every input here is lower-case, so this test does not
// exercise case-insensitivity of the no-wildcard path (e.g. "ARROW" against
// "arrow").
test_utf8_scalar!(
test_utf8_array_nilike_scalar_equals,
vec!["arrow", "parrow", "arrows", "arr"],
"arrow",
nilike_utf8_scalar,
vec![false, true, true, true]
);

// Single-character wildcard: `arrow_` requires exactly one character after
// "arrow", so only "arrows" matches (yielding `false`); "arrow" is one
// character short and the remaining values do not start with "arrow" at all.
test_utf8_scalar!(
test_utf8_array_nilike_scalar_one,
vec!["arrow", "arrows", "parrow", "arr"],
"arrow_",
nilike_utf8_scalar,
vec![true, false, true, true]
);

test_utf8!(
test_utf8_array_neq,
vec!["arrow", "arrow", "arrow", "arrow"],
Expand Down

0 comments on commit 9860aa7

Please sign in to comment.