Skip to content

Commit

Permalink
Improve validate_utf8 performance (apache#2048)
Browse files Browse the repository at this point in the history
* Add UTF-8 validation benchmark

* Improve UTF-8 validation performance

* Fix clippy errors in the benchmark

* Add is_char_boundary() check to UTF-8 validation
  • Loading branch information
tfeda committed Jul 26, 2022
1 parent 9c70e4a commit 0c64054
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 12 deletions.
15 changes: 13 additions & 2 deletions arrow/benches/array_data_validate.rs
Expand Up @@ -37,11 +37,22 @@ fn create_binary_array_data(length: i32) -> ArrayData {
.unwrap()
}

fn array_slice_benchmark(c: &mut Criterion) {
/// Runs `validate_values` on the array data backing `arr`.
///
/// # Panics
///
/// Panics if validation fails, because the result is unwrapped.
fn validate_utf8_array(arr: &StringArray) {
    let array_data = arr.data();
    let outcome = array_data.validate_values();
    outcome.unwrap();
}

/// Registers the array-data validation benchmarks on the given Criterion instance.
///
/// Two benchmarks are registered:
/// * `validate_binary_array_data 20000` times a call to
///   `create_binary_array_data(20000)`.
/// * `validate_utf8_array_data 20000` times `validate_utf8_array` over a
///   `StringArray` holding 20000 copies of `"test"`.
fn validate_benchmark(c: &mut Criterion) {
    // Binary array: every iteration builds the array data from scratch.
    c.bench_function("validate_binary_array_data 20000", |bencher| {
        bencher.iter(|| create_binary_array_data(20000))
    });

    // Utf8 array: the input is built once, outside the timed closure, so the
    // measured work is the validation call only.
    let utf8_input = StringArray::from(vec!["test"; 20000]);
    c.bench_function("validate_utf8_array_data 20000", |bencher| {
        bencher.iter(|| validate_utf8_array(&utf8_input))
    });
}

criterion_group!(benches, array_slice_benchmark);
// Collect `validate_benchmark` into the Criterion benchmark group `benches`.
criterion_group!(benches, validate_benchmark);
// Generate the benchmark entry point that runs the `benches` group.
criterion_main!(benches);
41 changes: 31 additions & 10 deletions arrow/src/array/data.rs
Expand Up @@ -1141,16 +1141,37 @@ impl ArrayData {
T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
{
let values_buffer = &self.buffers[1].as_slice();

self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
ArrowError::InvalidArgumentError(format!(
"Invalid UTF8 sequence at string index {} ({:?}): {}",
string_index, range, e
))
})?;
Ok(())
})
if let Ok(values_str) = std::str::from_utf8(values_buffer) {
// Validate Offsets are correct
self.validate_each_offset::<T, _>(
values_buffer.len(),
|string_index, range| {
if !values_str.is_char_boundary(range.start)
|| !values_str.is_char_boundary(range.end)
{
return Err(ArrowError::InvalidArgumentError(format!(
"incomplete utf-8 byte sequence from index {}",
string_index
)));
}
Ok(())
},
)
} else {
// find specific offset that failed utf8 validation
self.validate_each_offset::<T, _>(
values_buffer.len(),
|string_index, range| {
std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
ArrowError::InvalidArgumentError(format!(
"Invalid UTF8 sequence at string index {} ({:?}): {}",
string_index, range, e
))
})?;
Ok(())
},
)
}
}

/// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
Expand Down

0 comments on commit 0c64054

Please sign in to comment.