From 97994391e2fee3f8826de92314726f965b0f7414 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 8 Dec 2022 13:54:49 +0000 Subject: [PATCH] Split out arrow-string (#2594) --- Cargo.toml | 35 +- arrow-string/Cargo.toml | 49 + .../src}/concat_elements.rs | 12 +- .../kernels => arrow-string/src}/length.rs | 184 +- arrow-string/src/lib.rs | 24 + arrow-string/src/like.rs | 2101 +++++++++++++++++ .../kernels => arrow-string/src}/regexp.rs | 152 +- .../kernels => arrow-string/src}/substring.rs | 73 +- arrow/Cargo.toml | 4 +- arrow/src/compute/kernels/comparison.rs | 2021 +--------------- arrow/src/compute/kernels/mod.rs | 5 +- 11 files changed, 2534 insertions(+), 2126 deletions(-) create mode 100644 arrow-string/Cargo.toml rename {arrow/src/compute/kernels => arrow-string/src}/concat_elements.rs (96%) rename {arrow/src/compute/kernels => arrow-string/src}/length.rs (84%) create mode 100644 arrow-string/src/lib.rs create mode 100644 arrow-string/src/like.rs rename {arrow/src/compute/kernels => arrow-string/src}/regexp.rs (53%) rename {arrow/src/compute/kernels => arrow-string/src}/substring.rs (95%) diff --git a/Cargo.toml b/Cargo.toml index 16b4cb7f89e..556b86a008a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,23 +17,24 @@ [workspace] members = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-flight", - "arrow-integration-test", - "arrow-integration-testing", - "arrow-ipc", - "arrow-json", - "arrow-schema", - "arrow-select", - "object_store", - "parquet", - "parquet_derive", - "parquet_derive_test", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-flight", + "arrow-integration-test", + "arrow-integration-testing", + "arrow-ipc", + "arrow-json", + "arrow-schema", + "arrow-select", + "arrow-string", + "object_store", + "parquet", + "parquet_derive", + "parquet_derive_test", ] # Enable the version 2 feature resolver, which avoids unifying features for 
targets that are not being built # diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml new file mode 100644 index 00000000000..97c4b5ffbf1 --- /dev/null +++ b/arrow-string/Cargo.toml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-string" +version = "28.0.0" +description = "String kernels for arrow arrays" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_string" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } +regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } + +[features] +dyn_cmp_dict = [] diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow-string/src/concat_elements.rs similarity index 96% rename from arrow/src/compute/kernels/concat_elements.rs rename to arrow-string/src/concat_elements.rs index 25c8f60de3f..4fbcc9ce4f0 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::array::*; -use crate::error::{ArrowError, Result}; + +use arrow_array::{Array, GenericStringArray, OffsetSizeTrait}; +use arrow_array::builder::BufferBuilder; +use arrow_data::ArrayDataBuilder; use arrow_data::bit_mask::combine_option_bitmap; +use arrow_schema::ArrowError; /// Returns the elementwise concatenation of a [`StringArray`]. 
/// @@ -36,7 +39,7 @@ use arrow_data::bit_mask::combine_option_bitmap; pub fn concat_elements_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result> { +) -> Result, ArrowError> { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( "Arrays must have the same length: {} != {}", @@ -89,7 +92,7 @@ pub fn concat_elements_utf8( /// An error will be returned if the [`StringArray`] are of different lengths pub fn concat_elements_utf8_many( arrays: &[&GenericStringArray], -) -> Result> { +) -> Result, ArrowError> { if arrays.is_empty() { return Err(ArrowError::ComputeError( "concat requires input of at least one array".to_string(), @@ -157,6 +160,7 @@ pub fn concat_elements_utf8_many( #[cfg(test)] mod tests { + use arrow_array::StringArray; use super::*; #[test] fn test_string_concat() { diff --git a/arrow/src/compute/kernels/length.rs b/arrow-string/src/length.rs similarity index 84% rename from arrow/src/compute/kernels/length.rs rename to arrow-string/src/length.rs index a68aa2bde4e..f7faa0a6143 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow-string/src/length.rs @@ -17,12 +17,11 @@ //! Defines kernel for length of string arrays and binary arrays -use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType}; -use crate::{ - datatypes::*, - error::{ArrowError, Result}, -}; - +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::Buffer; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; macro_rules! unary_offsets { @@ -153,7 +152,7 @@ where /// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray, /// or DictionaryArray with above Arrays as values /// * length of null is null. 
-pub fn length(array: &dyn Array) -> Result { +pub fn length(array: &dyn Array) -> Result { match array.data_type() { DataType::Dictionary(kt, _) => { kernel_dict!( @@ -189,7 +188,7 @@ pub fn length(array: &dyn Array) -> Result { /// or DictionaryArray with above Arrays as values /// * bit_length of null is null. /// * bit_length is in number of bits -pub fn bit_length(array: &dyn Array) -> Result { +pub fn bit_length(array: &dyn Array) -> Result { match array.data_type() { DataType::Dictionary(kt, _) => { kernel_dict!( @@ -220,6 +219,7 @@ pub fn bit_length(array: &dyn Array) -> Result { #[cfg(test)] mod tests { use super::*; + use arrow_array::cast::as_primitive_array; fn double_vec(v: Vec) -> Vec { [&v[..], &v[..]].concat() @@ -245,11 +245,10 @@ mod tests { macro_rules! length_binary_helper { ($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{ let array = GenericBinaryArray::<$offset_ty>::from($value); - let result = $kernel(&array)?; + let result = $kernel(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }}; } @@ -259,64 +258,61 @@ mod tests { GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>( $value, ); - let result = length(&array)?; + let result = length(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }}; } #[test] #[cfg_attr(miri, ignore)] // running forever - fn length_test_string() -> Result<()> { + fn length_test_string() { length_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); 
expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value, result.value(i)); }); - Ok(()) }) } #[test] #[cfg_attr(miri, ignore)] // running forever - fn length_test_large_string() -> Result<()> { + fn length_test_large_string() { length_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value as i64, result.value(i)); }); - Ok(()) }) } #[test] - fn length_test_binary() -> Result<()> { + fn length_test_binary() { let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]]; let result: Vec = vec![4, 3, 2]; length_binary_helper!(i32, Int32Array, length, value, result) } #[test] - fn length_test_large_binary() -> Result<()> { + fn length_test_large_binary() { let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"]; let result: Vec = vec![4, 2, 3]; length_binary_helper!(i64, Int64Array, length, value, result) } #[test] - fn length_test_list() -> Result<()> { + fn length_test_list() { let value = vec![ Some(vec![]), Some(vec![Some(1), Some(2), Some(4)]), @@ -327,7 +323,7 @@ mod tests { } #[test] - fn length_test_large_list() -> Result<()> { + fn length_test_large_list() { let value = vec![ Some(vec![]), Some(vec![Some(1.1), Some(2.2), Some(3.3)]), @@ -348,28 +344,27 @@ mod tests { } #[test] - fn length_null_string() -> Result<()> { + fn length_null_string() { length_null_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); 
assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn length_null_large_string() -> Result<()> { + fn length_null_large_string() { length_null_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -380,12 +375,11 @@ mod tests { .collect::>() .into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn length_null_binary() -> Result<()> { + fn length_null_binary() { let value: Vec> = vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")]; let result: Vec> = vec![Some(4), None, Some(2), Some(5)]; @@ -393,7 +387,7 @@ mod tests { } #[test] - fn length_null_large_binary() -> Result<()> { + fn length_null_large_binary() { let value: Vec> = vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")]; let result: Vec> = vec![Some(2), None, Some(3), Some(5)]; @@ -401,7 +395,7 @@ mod tests { } #[test] - fn length_null_list() -> Result<()> { + fn length_null_list() { let value = vec![ Some(vec![]), None, @@ -413,7 +407,7 @@ mod tests { } #[test] - fn length_null_large_list() -> Result<()> { + fn length_null_large_list() { let value = vec![ Some(vec![]), None, @@ -434,31 +428,27 @@ mod tests { /// Tests with an offset #[test] - fn length_offsets_string() -> Result<()> { + fn length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let result = length(b.as_ref())?; + let result = length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(1), Some(5), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn length_offsets_binary() -> Result<()> { + fn length_offsets_binary() { let value: Vec> = vec![Some(b"hello"), Some(b" "), 
Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = length(b.as_ref())?; + let result = length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(1), Some(2), None]); assert_eq!(&expected, result); - - Ok(()) } fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { @@ -480,47 +470,45 @@ mod tests { #[test] #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI - fn bit_length_test_string() -> Result<()> { + fn bit_length_test_string() { bit_length_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value, result.value(i)); }); - Ok(()) }) } #[test] #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI - fn bit_length_test_large_string() -> Result<()> { + fn bit_length_test_large_string() { bit_length_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value as i64, result.value(i)); }); - Ok(()) }) } #[test] - fn bit_length_binary() -> Result<()> { + fn bit_length_binary() { let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"]; let expected: Vec = vec![24, 16, 40]; length_binary_helper!(i32, Int32Array, bit_length, value, expected) } #[test] - fn bit_length_large_binary() -> Result<()> { + fn bit_length_large_binary() { let value: Vec<&[u8]> = 
vec![b"zero", b" ", &[0xff, 0xf8]]; let expected: Vec = vec![32, 8, 16]; length_binary_helper!(i64, Int64Array, bit_length, value, expected) @@ -535,28 +523,27 @@ mod tests { } #[test] - fn bit_length_null_string() -> Result<()> { + fn bit_length_null_string() { bit_length_null_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn bit_length_null_large_string() -> Result<()> { + fn bit_length_null_large_string() { bit_length_null_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -567,12 +554,11 @@ mod tests { .collect::>() .into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn bit_length_null_binary() -> Result<()> { + fn bit_length_null_binary() { let value: Vec> = vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])]; let expected: Vec> = vec![Some(24), None, Some(40), Some(16)]; @@ -580,7 +566,7 @@ mod tests { } #[test] - fn bit_length_null_large_binary() -> Result<()> { + fn bit_length_null_large_binary() { let value: Vec> = vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")]; let expected: Vec> = vec![Some(24), None, Some(16), Some(32)]; @@ -597,47 +583,42 @@ mod tests { /// Tests with an offset #[test] - fn bit_length_offsets_string() -> Result<()> { + fn bit_length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let 
result = bit_length(b.as_ref())?; + let result = bit_length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(8), Some(40), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn bit_length_offsets_binary() -> Result<()> { + fn bit_length_offsets_binary() { let value: Vec> = vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = bit_length(b.as_ref())?; + let result = bit_length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(0), Some(40), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn length_dictionary() -> Result<()> { - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - Ok(()) - } - - fn _length_dictionary() -> Result<()> { + fn length_dictionary() { + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + } + + fn _length_dictionary() { const TOTAL: i32 = 100; let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"]; @@ -657,7 +638,7 @@ mod tests { let expected: Vec> = data.iter().map(|opt| opt.map(|s| s.len() as i32)).collect(); - let res = length(&dict_array)?; + let res = length(&dict_array).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); let actual: Vec> = actual .values() @@ -670,24 +651,21 @@ mod tests { for i in 0..TOTAL as usize { assert_eq!(expected[i], actual[i],); } - - Ok(()) } #[test] - fn bit_length_dictionary() -> Result<()> { - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - 
_bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - Ok(()) - } - - fn _bit_length_dictionary() -> Result<()> { + fn bit_length_dictionary() { + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + } + + fn _bit_length_dictionary() { const TOTAL: i32 = 100; let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"]; @@ -709,7 +687,7 @@ mod tests { .map(|opt| opt.map(|s| (s.chars().count() * 8) as i32)) .collect(); - let res = bit_length(&dict_array)?; + let res = bit_length(&dict_array).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); let actual: Vec> = actual .values() @@ -722,7 +700,5 @@ mod tests { for i in 0..TOTAL as usize { assert_eq!(expected[i], actual[i],); } - - Ok(()) } } diff --git a/arrow-string/src/lib.rs b/arrow-string/src/lib.rs new file mode 100644 index 00000000000..4bd4d282656 --- /dev/null +++ b/arrow-string/src/lib.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Arrow string kernels + +pub mod concat_elements; +pub mod length; +pub mod like; +pub mod regexp; +pub mod substring; diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs new file mode 100644 index 00000000000..e8fdc5cc0f6 --- /dev/null +++ b/arrow-string/src/like.rs @@ -0,0 +1,2101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::cast::*; +use arrow_array::*; +use arrow_buffer::{bit_util, MutableBuffer}; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::*; +use regex::Regex; +use std::collections::HashMap; + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// There are two wildcards supported with the LIKE operator: +/// +/// 1. `%` - The percent sign represents zero, one, or multiple characters +/// 2. 
`_` - The underscore represents a single character +/// +/// For example: +/// ``` +/// use arrow_array::{StringArray, BooleanArray}; +/// use arrow_string::like::like_utf8; +/// +/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); +/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); +/// +/// let result = like_utf8(&strings, &patterns).unwrap(); +/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); +/// ``` +pub fn like_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + like_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + like_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + like_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+#[cfg(feature = "dyn_cmp_dict")] +fn like_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( + left: L, + right: &str, + op: F, +) -> Result { + if !right.contains(is_like_pattern) { + // fast path, can use equals + Ok(BooleanArray::from_unary(left, |item| op(item == right))) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let starts_with = &right[..right.len() - 1]; + + Ok(BooleanArray::from_unary(left, |item| { + op(item.starts_with(starts_with)) + })) + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_with = &right[1..]; + + Ok(BooleanArray::from_unary(left, |item| { + op(item.ends_with(ends_with)) + })) + } else if right.starts_with('%') + && right.ends_with('%') + && !right.ends_with("\\%") + && !right[1..right.len() - 1].contains(is_like_pattern) + { + let contains = &right[1..right.len() - 1]; 
+ + Ok(BooleanArray::from_unary(left, |item| { + op(item.contains(contains)) + })) + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + })?; + + Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) + } +} + +#[inline] +fn like_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| x) +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + like_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + like_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + like_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + like_scalar(left, right) +} + +/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+fn like_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + like_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + like_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: +/// +/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` +/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` +/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` +fn replace_like_wildcards(pattern: &str) -> Result { + let mut result = String::new(); + let pattern = String::from(pattern); + let mut chars_iter = pattern.chars().peekable(); + while let Some(c) = chars_iter.next() { + if c == '\\' { + let next = chars_iter.peek(); + match next { + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + _ => { + result.push('\\'); + result.push('\\'); + } + } + } else if regex_syntax::is_meta_character(c) { + result.push('\\'); + result.push(c); + } else if c == '%' { + result.push_str(".*"); + } else if c == '_' { + result.push('.'); + } else { + result.push(c); + } + } + Ok(result) +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nlike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nlike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nlike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nlike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+#[cfg(feature = "dyn_cmp_dict")] +fn nlike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn nlike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| !x) +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nlike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nlike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nlike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + nlike_scalar(left, right) +} + +/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn nlike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + nlike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn ilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / [`LargeStringArray`], +/// or, with feature `dyn_cmp_dict`, on [`DictionaryArray`] with such values. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + ilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + ilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + ilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")] +fn ilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn ilike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let bytes = bit_util::ceil(left.len(), 8); + let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); + let bool_slice = bool_buf.as_slice_mut(); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + let right_uppercase = right.to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase() == right_uppercase { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let start_str = &right[..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if left + .value_unchecked(i) + .to_uppercase() + .starts_with(start_str) + { + bit_util::set_bit(bool_slice, i); + } + } + } + } else 
if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_str = &right[1..].to_uppercase(); + + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase().ends_with(ends_str) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') + && right.ends_with('%') + && !right[1..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use contains + let contains = &right[1..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase().contains(contains) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + + for i in 0..left.len() { + let haystack = unsafe { left.value_unchecked(i) }; + if re.is_match(haystack) { + bit_util::set_bit(bool_slice, i); + } + } + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![bool_buf.into()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn ilike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + ilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + ilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + ilike_scalar(left, right) +} + +/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn ilike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + ilike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / [`LargeStringArray`], +/// or, with feature `dyn_cmp_dict`, on [`DictionaryArray`] with such values. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")] +fn nilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn nilike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let bytes = bit_util::ceil(left.len(), 8); + let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); + let bool_slice = bool_buf.as_slice_mut(); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + let right_uppercase = right.to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase() != right_uppercase { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let start_str = &right[..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if !(left + .value_unchecked(i) + .to_uppercase() + .starts_with(start_str)) + { + bit_util::set_bit(bool_slice, i); + } + } + } + } 
else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_str = &right[1..].to_uppercase(); + + for i in 0..left.len() { + unsafe { + if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') + && right.ends_with('%') + && !right[1..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use contains + let contains = &right[1..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if !(left.value_unchecked(i).to_uppercase().contains(contains)) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + + for i in 0..left.len() { + let haystack = unsafe { left.value_unchecked(i) }; + if !re.is_match(haystack) { + bit_util::set_bit(bool_slice, i); + } + } + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![bool_buf.into()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nilike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + nilike_scalar(left, right) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn nilike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + nilike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +fn is_like_pattern(c: char) -> bool { + c == '%' || c == '_' +} + +/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] +/// +/// If `negate_regex` is true, the regex expression will be negated. 
(for example, with `not like`) +fn regex_like<'a, S: ArrayAccessor, F>( + left: S, + right: S, + negate_regex: bool, + op: F, +) -> Result +where + F: Fn(&str) -> Result, +{ + let mut map = HashMap::new(); + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + + let null_bit_buffer = + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); + + let mut result = BooleanBufferBuilder::new(left.len()); + for i in 0..left.len() { + let haystack = left.value(i); + let pat = right.value(i); + let re = if let Some(ref regex) = map.get(pat) { + regex + } else { + let re_pattern = replace_like_wildcards(pat)?; + let re = op(&re_pattern)?; + map.insert(pat, re); + map.get(pat).unwrap() + }; + + result.append(if negate_regex { + !re.is_match(haystack) + } else { + re.is_match(haystack) + }); + } + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::types::Int8Type; + + macro_rules! test_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let right = StringArray::from($right); + let res = $op(&left, &right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + + macro_rules! 
test_dict_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn $test_name() { + let left: DictionaryArray = $left.into_iter().collect(); + let right: DictionaryArray = $right.into_iter().collect(); + let res = $op(&left, &right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + + macro_rules! test_utf8_scalar { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let res = $op(&left, $right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + + let left = LargeStringArray::from($left); + let res = $op(&left, $right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + } + }; + ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { + test_utf8_scalar!($test_name, $left, $right, $op, $expected); + test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected); + }; + } + + + test_utf8!( + test_utf8_array_like, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], + like_utf8, + vec![true, true, true, false, false, true, false, false] + ); + + test_dict_utf8!( + test_utf8_array_like_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", 
"ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], + like_dyn, + vec![true, true, true, false, false, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_testing, + test_utf8_array_like_scalar_dyn_escape_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex, + test_utf8_array_like_scalar_dyn_escape_regex, + vec![".*", "a", "*"], + ".*", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex_dot, + test_utf8_array_like_scalar_dyn_escape_regex_dot, + vec![".", "a", "*"], + ".", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar, + test_utf8_array_like_scalar_dyn, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_start, + test_utf8_array_like_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, true, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_end, + test_utf8_array_like_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "%arrow", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_equals, + test_utf8_array_like_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_one, + test_utf8_array_like_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + like_utf8_scalar, + like_utf8_scalar_dyn, + 
vec![false, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_scalar_like_escape, + test_utf8_scalar_like_dyn_escape, + vec!["a%", "a\\x"], + "a\\%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false] + ); + + test_utf8_scalar!( + test_utf8_scalar_like_escape_contains, + test_utf8_scalar_like_dyn_escape_contains, + vec!["ba%", "ba\\x"], + "%a\\%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false] + ); + + test_utf8!( + test_utf8_scalar_ilike_regex, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_utf8, + vec![true] + ); + + test_dict_utf8!( + test_utf8_scalar_ilike_regex_dict, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_dyn, + vec![true] + ); + + #[test] + fn test_replace_like_wildcards() { + let a_eq = "_%"; + let expected = "..*"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_leave_like_meta_chars() { + let a_eq = "\\%\\_"; + let expected = "%_"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_with_multiple_escape_chars() { + let a_eq = "\\\\%"; + let expected = "\\\\%"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_escape_regex_meta_char() { + let a_eq = "."; + let expected = "\\."; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + test_utf8!( + test_utf8_array_nlike, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_utf8, + vec![false, false, false, true, true, false, true] + ); + + test_dict_utf8!( + test_utf8_array_nlike_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_dyn, + vec![false, false, false, true, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_escape_testing, + 
test_utf8_array_nlike_escape_dyn_testing_dyn, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex, + test_utf8_array_nlike_scalar_dyn_escape_regex, + vec![".*", "a", "*"], + ".*", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex_dot, + test_utf8_array_nlike_scalar_dyn_escape_regex_dot, + vec![".", "a", "*"], + ".", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true] + ); + test_utf8_scalar!( + test_utf8_array_nlike_scalar, + test_utf8_array_nlike_scalar_dyn, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_start, + test_utf8_array_nlike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_end, + test_utf8_array_nlike_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "%arrow", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_equals, + test_utf8_array_nlike_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_one, + test_utf8_array_nlike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![true, false, true, true] + ); + + test_utf8!( + test_utf8_array_ilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", 
"foo", "ar%r", "arrow_", "arrow_"], + ilike_utf8, + vec![true, true, true, false, false, true, false] + ); + + test_dict_utf8!( + test_utf8_array_ilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + ilike_dyn, + vec![true, true, true, false, false, true, false] + ); + + test_utf8_scalar!( + ilike_utf8_scalar_escape_testing, + ilike_utf8_scalar_escape_dyn_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar, + test_utf8_array_ilike_dyn_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_start, + test_utf8_array_ilike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, false, true, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_end, + test_utf8_array_ilike_scalar_dyn_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_equals, + test_utf8_array_ilike_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "Arrow", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, false, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_one, + test_utf8_array_ilike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, false, false] + ); + + test_utf8!( + test_utf8_array_nilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], 
+ nilike_utf8, + vec![false, false, false, true, true, false, true] + ); + + test_dict_utf8!( + test_utf8_array_nilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_dyn, + vec![false, false, false, true, true, false, true] + ); + + test_utf8_scalar!( + nilike_utf8_scalar_escape_testing, + nilike_utf8_scalar_escape_dyn_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar, + test_utf8_array_nilike_dyn_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_start, + test_utf8_array_nilike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_end, + test_utf8_array_nilike_scalar_dyn_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_equals, + test_utf8_array_nilike_scalar_dyn_equals, + vec!["arRow", "parrow", "arrows", "arr"], + "Arrow", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_one, + test_utf8_array_nilike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![true, false, true, true] + ); + + #[test] + fn test_dict_like_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray 
= data.into_iter().collect(); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + } + + #[test] + fn test_dict_nlike_kernels() { + let data = vec![ + Some("Earth"), + 
Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + 
Some(true) + ]), + ); + } + + #[test] + fn test_dict_ilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, 
"%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + } + + #[test] + fn test_dict_nilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + 
Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + } +} diff --git a/arrow/src/compute/kernels/regexp.rs b/arrow-string/src/regexp.rs similarity index 53% rename from arrow/src/compute/kernels/regexp.rs rename to arrow-string/src/regexp.rs index 1c5fa192775..da8cd9d522b 100644 --- a/arrow/src/compute/kernels/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -18,22 +18,154 @@ //! Defines kernel to extract substrings based on a regular //! expression of a \[Large\]StringArray -use crate::array::{ - ArrayRef, GenericStringArray, GenericStringBuilder, ListBuilder, OffsetSizeTrait, -}; -use crate::error::{ArrowError, Result}; +use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; +use arrow_array::{Array, ArrayRef, BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use regex::Regex; use std::collections::HashMap; - use std::sync::Arc; -use regex::Regex; +/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. +/// If `regex_array` element has an empty value, the corresponding result value is always true. +/// +/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow +/// special search modes, such as case insensitive and multi-line mode. +/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) +/// for more information. 
+pub fn regexp_is_match_utf8( + array: &GenericStringArray, + regex_array: &GenericStringArray, + flags_array: Option<&GenericStringArray>, +) -> Result { + if array.len() != regex_array.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + let null_bit_buffer = + combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); + + let mut patterns: HashMap = HashMap::new(); + let mut result = BooleanBufferBuilder::new(array.len()); + + let complete_pattern = match flags_array { + Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( + |(pattern, flags)| { + pattern.map(|pattern| match flags { + Some(flag) => format!("(?{}){}", flag, pattern), + None => pattern.to_string(), + }) + }, + )) as Box>>, + None => Box::new( + regex_array + .iter() + .map(|pattern| pattern.map(|pattern| pattern.to_string())), + ), + }; + + array + .iter() + .zip(complete_pattern) + .map(|(value, pattern)| { + match (value, pattern) { + // Required for Postgres compatibility: + // SELECT 'foobarbequebaz' ~ ''); = true + (Some(_), Some(pattern)) if pattern == *"" => { + result.append(true); + } + (Some(value), Some(pattern)) => { + let existing_pattern = patterns.get(&pattern); + let re = match existing_pattern { + Some(re) => re.clone(), + None => { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {:?}", + e + )) + })?; + patterns.insert(pattern, re.clone()); + re + } + }; + result.append(re.is_match(value)); + } + _ => result.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()?; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. 
+/// +/// See the documentation on [`regexp_is_match_utf8`] for more details. +pub fn regexp_is_match_utf8_scalar( + array: &GenericStringArray, + regex: &str, + flag: Option<&str>, +) -> Result { + let null_bit_buffer = array.data().null_buffer().cloned(); + let mut result = BooleanBufferBuilder::new(array.len()); + + let pattern = match flag { + Some(flag) => format!("(?{}){}", flag, regex), + None => regex.to_string(), + }; + if pattern.is_empty() { + result.append_n(array.len(), true); + } else { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {:?}", + e + )) + })?; + for i in 0..array.len() { + let value = array.value(i); + result.append(re.is_match(value)); + } + } + + let buffer = result.finish(); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} /// Extract all groups matched by a regular expression for a given String array. 
pub fn regexp_match( array: &GenericStringArray, regex_array: &GenericStringArray, flags_array: Option<&GenericStringArray>, -) -> Result { +) -> Result { let mut patterns: HashMap = HashMap::new(); let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); @@ -94,14 +226,14 @@ pub fn regexp_match( } Ok(()) }) - .collect::>>()?; + .collect::, ArrowError>>()?; Ok(Arc::new(list_builder.finish())) } #[cfg(test)] mod tests { use super::*; - use crate::array::{ListArray, StringArray}; + use arrow_array::{ListArray, StringArray}; #[test] fn match_single_group() { @@ -117,7 +249,7 @@ mod tests { let mut pattern_values = vec![r".*-(\d*)-.*"; 4]; pattern_values.push(r"(bar)(bequ1e)"); pattern_values.push(""); - let pattern = StringArray::from(pattern_values); + let pattern = GenericStringArray::::from(pattern_values); let actual = regexp_match(&array, &pattern, None).unwrap(); let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); let mut expected_builder = ListBuilder::new(elem_builder); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow-string/src/substring.rs similarity index 95% rename from arrow/src/compute/kernels/substring.rs rename to arrow-string/src/substring.rs index 23cb2c19fdd..ece36755341 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow-string/src/substring.rs @@ -19,14 +19,12 @@ //! Supported array types: //! 
[GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray] -use crate::array::DictionaryArray; -use crate::buffer::MutableBuffer; -use crate::datatypes::*; -use crate::{array::*, buffer::Buffer}; -use crate::{ - datatypes::DataType, - error::{ArrowError, Result}, -}; +use arrow_array::builder::BufferBuilder; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::cmp::Ordering; use std::sync::Arc; @@ -45,8 +43,8 @@ use std::sync::Arc; /// /// # Basic usage /// ``` -/// # use arrow::array::StringArray; -/// # use arrow::compute::kernels::substring::substring; +/// # use arrow_array::StringArray; +/// # use arrow_string::substring::substring; /// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]); /// let result = substring(&array, 1, Some(4)).unwrap(); /// let result = result.as_any().downcast_ref::().unwrap(); @@ -61,13 +59,17 @@ use std::sync::Arc; /// /// ## Example of trying to get an invalid utf-8 format substring /// ``` -/// # use arrow::array::StringArray; -/// # use arrow::compute::kernels::substring::substring; +/// # use arrow_array::StringArray; +/// # use arrow_string::substring::substring; /// let array = StringArray::from(vec![Some("E=mc²")]); /// let error = substring(&array, 0, Some(5)).unwrap_err().to_string(); /// assert!(error.contains("invalid utf-8 boundary")); /// ``` -pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result { +pub fn substring( + array: &dyn Array, + start: i64, + length: Option, +) -> Result { macro_rules! 
substring_dict { ($kt: ident, $($t: ident: $gt: ident), *) => { match $kt.as_ref() { @@ -171,8 +173,8 @@ pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result( array: &GenericStringArray, start: i64, length: Option, -) -> Result> { +) -> Result, ArrowError> { let mut vals = BufferBuilder::::new({ let offsets = array.value_offsets(); (offsets[array.len()] - offsets[0]).to_usize().unwrap() @@ -251,7 +253,7 @@ fn binary_substring( array: &GenericBinaryArray, start: OffsetSize, length: Option, -) -> Result { +) -> Result { let offsets = array.value_offsets(); let data = array.value_data(); let zero = OffsetSize::zero(); @@ -312,7 +314,7 @@ fn fixed_size_binary_substring( old_len: i32, start: i32, length: Option, -) -> Result { +) -> Result { let new_start = if start >= 0 { start.min(old_len) } else { @@ -361,7 +363,7 @@ fn utf8_substring( array: &GenericStringArray, start: OffsetSize, length: Option, -) -> Result { +) -> Result { let offsets = array.value_offsets(); let data = array.value_data(); let zero = OffsetSize::zero(); @@ -391,21 +393,23 @@ fn utf8_substring( let mut len_so_far = zero; new_offsets.push(zero); - offsets.windows(2).try_for_each(|pair| -> Result<()> { - let new_start = match start.cmp(&zero) { - Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, - Ordering::Equal => pair[0], - Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, - }; - let new_end = match length { - Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, - None => pair[1], - }; - len_so_far += new_end - new_start; - new_starts_ends.push((new_start, new_end)); - new_offsets.push(len_so_far); - Ok(()) - })?; + offsets + .windows(2) + .try_for_each(|pair| -> Result<(), ArrowError> { + let new_start = match start.cmp(&zero) { + Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, + Ordering::Equal => pair[0], + Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, + 
}; + let new_end = match length { + Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, + None => pair[1], + }; + len_so_far += new_end - new_start; + new_starts_ends.push((new_start, new_end)); + new_offsets.push(len_so_far); + Ok(()) + })?; // concatenate substrings into a buffer let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); @@ -439,7 +443,6 @@ fn utf8_substring( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::*; /// A helper macro to generate test cases. /// # Arguments diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 876d0d65084..17f88c084cb 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,12 +54,12 @@ arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true } arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true } arrow-schema = { version = "28.0.0", path = "../arrow-schema" } arrow-select = { version = "28.0.0", path = "../arrow-select" } +arrow-string = { version = "28.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.13", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } @@ -92,7 +92,7 @@ force_validate = ["arrow-data/force_validate"] ffi = ["bitflags"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against 
scalars -dyn_cmp_dict = [] +dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"] # Enable dyn-arithmetic kernels for dictionary arrays # Note: this does not impact arithmetic with scalars dyn_arith_dict = [] diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index b672410fec1..cc5aca1527b 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -23,1227 +23,75 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. //! -use crate::array::*; -use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; -use crate::datatypes::*; -#[allow(unused_imports)] -use crate::downcast_dictionary_array; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_select::take::take; -use num::ToPrimitive; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -fn compare_op( - left: T, - right: S, - op: F, -) -> Result -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -fn compare_op_scalar(left: T, op: F) -> Result -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified -/// comparison function. 
-pub fn no_simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op(left, right, op) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using -/// a specified comparison function. -pub fn no_simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op_scalar(left, |l| op(l, right)) -} - -fn is_like_pattern(c: char) -> bool { - c == '%' || c == '_' -} - -/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] -/// -/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) -fn regex_like<'a, S: ArrayAccessor, F>( - left: S, - right: S, - negate_regex: bool, - op: F, -) -> Result -where - F: Fn(&str) -> Result, -{ - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); - - let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = replace_like_wildcards(pat)?; - let re = op(&re_pattern)?; - map.insert(pat, re); - map.get(pat).unwrap() - }; - - result.append(if negate_regex { - !re.is_match(haystack) - } else { - re.is_match(haystack) - }); - } - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. 
-/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow::array::{StringArray, BooleanArray}; -/// use arrow::compute::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - like_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - like_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - like_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn like_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( - left: L, - right: &str, - op: F, -) -> Result { - if !right.contains(is_like_pattern) { - // fast path, can use equals - compare_op_scalar(left, |item| op(item == right)) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - compare_op_scalar(left, |item| op(item.starts_with(starts_with))) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - compare_op_scalar(left, |item| op(item.ends_with(ends_with))) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - compare_op_scalar(left, |item| 
op(item.contains(contains))) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - - compare_op_scalar(left, |item| op(re.is_match(item))) - } -} - -#[inline] -fn like_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, |x| x) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - like_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - like_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - like_scalar(left, right) -} - -/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-fn like_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } - } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nlike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nlike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nlike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nlike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn nlike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nlike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nlike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - nlike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nlike_scalar(left, right) -} - -/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nlike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn ilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - ilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - ilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - ilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn ilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn ilike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() == right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str) - { - bit_util::set_bit(bool_slice, i); - } - } - } - } else 
if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); - - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().ends_with(ends_str) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().contains(contains) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - ilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - ilike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - ilike_scalar(left, right) -} - -/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn ilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn nilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn nilike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() != right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if !(left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str)) - { - bit_util::set_bit(bool_slice, i); - } - } - } - } 
else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); - - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().contains(contains)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if !re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - nilike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nilike_scalar(left, right) -} +pub use arrow_string::like::*; +pub use arrow_string::regexp::{regexp_is_match_utf8, regexp_is_match_utf8_scalar}; -/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-fn nilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} +use crate::array::*; +use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; +use crate::datatypes::*; +#[allow(unused_imports)] +use crate::downcast_dictionary_array; +use crate::error::{ArrowError, Result}; +use crate::util::bit_util; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_select::take::take; +use num::ToPrimitive; -/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow -/// special search modes, such as case insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -pub fn regexp_is_match_utf8( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> Result { - if array.len() != regex_array.len() { +/// Helper function to perform boolean lambda function on values from two array accessors, this +/// version does not attempt to use SIMD. 
+fn compare_op( + left: T, + right: S, + op: F, +) -> Result +where + F: Fn(T::Item, S::Item) -> bool, +{ + if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform comparison operation on arrays of different length" .to_string(), )); } - let null_bit_buffer = - combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); - - let mut patterns: HashMap = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{}){}", flag, pattern), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT 'foobarbequebaz' ~ ''); = true - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re.clone(), - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) - })?; - patterns.insert(pattern, re.clone()); - re - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::>>()?; - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + Ok(BooleanArray::from_binary(left, right, op)) } -/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. 
-/// -/// See the documentation on [`regexp_is_match_utf8`] for more details. -pub fn regexp_is_match_utf8_scalar( - array: &GenericStringArray, - regex: &str, - flag: Option<&str>, -) -> Result { - let null_bit_buffer = array.data().null_buffer().cloned(); - let mut result = BooleanBufferBuilder::new(array.len()); +/// Helper function to perform boolean lambda function on values from array accessor, this +/// version does not attempt to use SIMD. +fn compare_op_scalar(left: T, op: F) -> Result +where + F: Fn(T::Item) -> bool, +{ + Ok(BooleanArray::from_unary(left, op)) +} - let pattern = match flag { - Some(flag) => format!("(?{}){}", flag, regex), - None => regex.to_string(), - }; - if pattern.is_empty() { - result.append_n(array.len(), true); - } else { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) - })?; - for i in 0..array.len() { - let value = array.value(i); - result.append(re.is_match(value)); - } - } +/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified +/// comparison function. +pub fn no_simd_compare_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + compare_op(left, right, op) +} - let buffer = result.finish(); - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using +/// a specified comparison function. +pub fn no_simd_compare_op_scalar( + left: &PrimitiveArray, + right: T::Native, + op: F, +) -> Result +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + compare_op_scalar(left, |l| op(l, right)) } /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`]. 
@@ -4879,432 +3727,25 @@ mod tests { left.value(i), i, $right - ); - } - } - }; - } - - test_utf8!( - test_utf8_array_like, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_utf8, - vec![true, true, true, false, false, true, false, false] - ); - - test_dict_utf8!( - test_utf8_array_like_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_dyn, - vec![true, true, true, false, false, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_testing, - test_utf8_array_like_scalar_dyn_escape_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_regex, - test_utf8_array_like_scalar_dyn_escape_regex, - vec![".*", "a", "*"], - ".*", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_regex_dot, - test_utf8_array_like_scalar_dyn_escape_regex_dot, - vec![".", "a", "*"], - ".", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar, - test_utf8_array_like_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_start, - test_utf8_array_like_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_end, - test_utf8_array_like_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - like_utf8_scalar, - 
like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_equals, - test_utf8_array_like_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_one, - test_utf8_array_like_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![false, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_scalar_like_escape, - test_utf8_scalar_like_dyn_escape, - vec!["a%", "a\\x"], - "a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] - ); - - test_utf8_scalar!( - test_utf8_scalar_like_escape_contains, - test_utf8_scalar_like_dyn_escape_contains, - vec!["ba%", "ba\\x"], - "%a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] - ); - - test_utf8!( - test_utf8_scalar_ilike_regex, - vec!["%%%"], - vec![r#"\%_\%"#], - ilike_utf8, - vec![true] - ); - - test_dict_utf8!( - test_utf8_scalar_ilike_regex_dict, - vec!["%%%"], - vec![r#"\%_\%"#], - ilike_dyn, - vec![true] - ); - - #[test] - fn test_replace_like_wildcards() { - let a_eq = "_%"; - let expected = "..*"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_leave_like_meta_chars() { - let a_eq = "\\%\\_"; - let expected = "%_"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_with_multiple_escape_chars() { - let a_eq = "\\\\%"; - let expected = "\\\\%"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_escape_regex_meta_char() { - let a_eq = "."; - let expected = "\\."; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - test_utf8!( - test_utf8_array_eq, - vec!["arrow", "arrow", "arrow", "arrow"], - vec!["arrow", 
"parquet", "datafusion", "flight"], - eq_utf8, - vec![true, false, false, false] - ); - test_utf8_scalar!( - test_utf8_array_eq_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "arrow", - eq_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8!( - test_utf8_array_nlike, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nlike_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_dyn, - vec![false, false, false, true, true, false, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_escape_testing, - test_utf8_array_nlike_escape_dyn_testing_dyn, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_escape_regex, - test_utf8_array_nlike_scalar_dyn_escape_regex, - vec![".*", "a", "*"], - ".*", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_escape_regex_dot, - test_utf8_array_nlike_scalar_dyn_escape_regex_dot, - vec![".", "a", "*"], - ".", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true] - ); - test_utf8_scalar!( - test_utf8_array_nlike_scalar, - test_utf8_array_nlike_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_start, - test_utf8_array_nlike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, false, true] - ); - - test_utf8_scalar!( - 
test_utf8_array_nlike_scalar_end, - test_utf8_array_nlike_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_equals, - test_utf8_array_nlike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_one, - test_utf8_array_nlike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![true, false, true, true] - ); - - test_utf8!( - test_utf8_array_ilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_utf8, - vec![true, true, true, false, false, true, false] - ); - - test_dict_utf8!( - test_utf8_array_ilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_dyn, - vec![true, true, true, false, false, true, false] - ); - - test_utf8_scalar!( - ilike_utf8_scalar_escape_testing, - ilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar, - test_utf8_array_ilike_dyn_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%AR%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_start, - test_utf8_array_ilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], - "aRRow%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_end, - 
test_utf8_array_ilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], - "%arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_equals, - test_utf8_array_ilike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "Arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_one, - test_utf8_array_ilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, false, false] - ); + ); + } + } + }; + } test_utf8!( - test_utf8_array_nilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_dyn, - vec![false, false, false, true, true, false, true] - ); - - test_utf8_scalar!( - nilike_utf8_scalar_escape_testing, - nilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar, - test_utf8_array_nilike_dyn_scalar, + test_utf8_array_eq, + vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"], - "%AR%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_start, - test_utf8_array_nilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], - "aRRow%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, false, true] - 
); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_end, - test_utf8_array_nilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], - "%arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_equals, - test_utf8_array_nilike_scalar_dyn_equals, - vec!["arRow", "parrow", "arrows", "arr"], - "Arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, true, true] + eq_utf8, + vec![true, false, false, false] ); - test_utf8_scalar!( - test_utf8_array_nilike_scalar_one, - test_utf8_array_nilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![true, false, true, true] + test_utf8_array_eq_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "arrow", + eq_utf8_scalar, + vec![true, false, false, false] ); test_utf8!( @@ -6667,86 +5108,6 @@ mod tests { assert_eq!(gt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } - #[test] - fn test_dict_like_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, 
Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - } - #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_to_utf8_array() { @@ -6959,246 +5320,6 @@ mod tests { ); } - #[test] - fn test_dict_nlike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), 
Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - } - - #[test] - fn test_dict_ilike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, 
"wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - } - - #[test] - fn test_dict_nilike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, 
Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - } - #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dict_non_dict_float_nan() { diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 0eebb701232..29468861f82 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -23,15 +23,12 @@ pub mod arity; pub mod bitwise; pub mod boolean; pub mod comparison; -pub mod concat_elements; -pub mod length; pub mod limit; pub mod partition; -pub mod regexp; pub mod sort; -pub mod substring; pub mod temporal; 
pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_select::{concat, filter, interleave, take, window, zip}; +pub use arrow_string::{concat_elements, length, regexp, substring};