diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 4dced67ad87..21d83e07eec 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { .unwrap(); } +fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { + nilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) + .unwrap(); +} + fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { regexp_is_match_utf8_scalar( criterion::black_box(arr_a), @@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX")) }); + c.bench_function("nilike_utf8 scalar equals", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xxXX")) + }); + + c.bench_function("nilike_utf8 scalar contains", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xxXX%")) + }); + + c.bench_function("nilike_utf8 scalar ends with", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%")) + }); + + c.bench_function("nilike_utf8 scalar starts with", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx")) + }); + + c.bench_function("nilike_utf8 scalar complex", |b| { + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX")) + }); + c.bench_function("egexp_matches_utf8 scalar starts with", |b| { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx")) }); diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index acb9ac229b4..068b9dedf59 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar( Ok(BooleanArray::from(data)) } +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let mut result = BooleanBufferBuilder::new(left.len()); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + for i in 0..left.len() { + result.append(left.value(i) != right); + } + } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use ends_with + for i in 0..left.len() { + result.append( + !left + .value(i) + .to_uppercase() + .starts_with(&right[..right.len() - 1].to_uppercase()), + ); + } + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use starts_with + for i in 0..left.len() { + result.append( + !left + .value(i) + .to_uppercase() + .ends_with(&right[1..].to_uppercase()), + ); + } + } else { + let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + for i in 0..left.len() { + let haystack = left.value(i); + result.append(!re.is_match(haystack)); + } + } + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + /// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. /// If `regex_array` element has an empty value, the corresponding result value is always true. /// @@ -3984,6 +4067,60 @@ mod tests { vec![false, true, false, false] ); + test_utf8!( + test_utf8_array_nilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_utf8, + vec![false, false, false, true, true, false, true] + ); + test_utf8_scalar!( + nilike_utf8_scalar_escape_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + test_utf8_scalar!( + test_utf8_array_nilike_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + nilike_utf8_scalar, + vec![false, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + nilike_utf8_scalar, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + nilike_utf8_scalar, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nilike_utf8_scalar, + vec![true, false, true, true] + ); + test_utf8!( test_utf8_array_neq, vec!["arrow", "arrow", "arrow", "arrow"],