From c08e5324d11a8a2b9998d0d625876815f4954968 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Sat, 11 Jun 2022 17:18:04 +0800 Subject: [PATCH] speed up `substring_by_char` by about 2.5x (#1832) * speed up substring_by_char Signed-off-by: remzi <13716567376yh@gmail.com> * better estimate the length of value buffer Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/substring.rs | 79 ++++++++++++++++++++------ 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index 1954307e9ac..625a37514d1 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -182,24 +182,69 @@ pub fn substring_by_char( start: i64, length: Option<u64>, ) -> Result<GenericStringArray<OffsetSize>> { - Ok(array - .iter() - .map(|val| { - val.map(|val| { - let char_count = val.chars().count(); - let start = if start >= 0 { - start.to_usize().unwrap().min(char_count) - } else { - char_count - (-start).to_usize().unwrap().min(char_count) - }; - let length = length.map_or(char_count - start, |length| { - length.to_usize().unwrap().min(char_count - start) - }); + let mut vals = BufferBuilder::<u8>::new({ + let offsets = array.value_offsets(); + (offsets[array.len()] - offsets[0]).to_usize().unwrap() + }); + let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1); + new_offsets.append(OffsetSize::zero()); + let length = length.map(|len| len.to_usize().unwrap()); + + array.iter().for_each(|val| { + if let Some(val) = val { + let char_count = val.chars().count(); + let start = if start >= 0 { + start.to_usize().unwrap() + } else { + char_count - (-start).to_usize().unwrap().min(char_count) + }; + let (start_offset, end_offset) = get_start_end_offset(val, start, length); + vals.append_slice(&val.as_bytes()[start_offset..end_offset]); + } + new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap()); + }); + let data = unsafe { 
ArrayData::new_unchecked( + GenericStringArray::<OffsetSize>::get_data_type(), + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![new_offsets.finish(), vals.finish()], + vec![], + ) + }; + Ok(GenericStringArray::<OffsetSize>::from(data)) +} - val.chars().skip(start).take(length).collect::<String>() - }) - }) - .collect::<GenericStringArray<OffsetSize>>()) +/// * `val` - string +/// * `start` - the start char index of the substring +/// * `length` - the char length of the substring +/// +/// Return the `start` and `end` offset (by byte) of the substring +fn get_start_end_offset( + val: &str, + start: usize, + length: Option<usize>, +) -> (usize, usize) { + let len = val.len(); + let mut offset_char_iter = val.char_indices(); + let start_offset = offset_char_iter + .nth(start) + .map_or(len, |(offset, _)| offset); + let end_offset = length.map_or(len, |length| { + if length > 0 { + offset_char_iter + .nth(length - 1) + .map_or(len, |(offset, _)| offset) + } else { + start_offset + } + }); + (start_offset, end_offset) } fn binary_substring(