Skip to content

Commit

Permalink
Faster BinaryArray to StringArray conversion (#3168)
Browse files Browse the repository at this point in the history
* Faster ByteArray to StringArray conversion

* Add benchmark

* Fix logical conflict
  • Loading branch information
tustvold committed Nov 24, 2022
1 parent 1d22fe3 commit 8ba7842
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
16 changes: 15 additions & 1 deletion arrow-array/src/array/string_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,22 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
/// Converts a binary array into a string array by validating its bytes as
/// UTF-8 and then re-tagging its array data with `Self::DATA_TYPE`.
///
/// # Panics
///
/// Panics with "Invalid UTF-8 sequence" if the value bytes are not valid
/// UTF-8, or if any value offset does not fall on a character boundary.
fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
let offsets = v.value_offsets();
let values = v.value_data();

// Validate the value bytes as UTF-8 in a single pass, then check that
// every offset lands on a character boundary, so that no value starts or
// ends in the middle of a multi-byte character.
let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence");
for offset in offsets.iter() {
assert!(
validated.is_char_boundary(offset.as_usize()),
"Invalid UTF-8 sequence"
)
}

// Carry the existing array data over; only the data type is changed.
let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
// NOTE(review): the next line appears to be the removed (pre-change) side
// of this diff hunk, shown here next to its replacement below; confirm
// against the actual file before treating both as live code.
Self::from(builder.build().unwrap())
// SAFETY:
// Validated UTF-8 above
// NOTE(review): `build_unchecked` presumably skips the builder's own
// validation, which is why the checks above must cover it; confirm.
Self::from(unsafe { builder.build_unchecked() })
}
}

Expand Down
6 changes: 6 additions & 0 deletions arrow/benches/array_data_validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ fn validate_benchmark(c: &mut Criterion) {
c.bench_function("validate_utf8_array_data 20000", |b| {
b.iter(|| validate_utf8_array(&str_arr))
});

let byte_array =
BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000));
c.bench_function("byte_array_to_string_array 20000", |b| {
b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone())))
});
}

criterion_group!(benches, validate_benchmark);
Expand Down
2 changes: 1 addition & 1 deletion arrow/src/row/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1425,7 +1425,7 @@ mod tests {
}

#[test]
#[should_panic(expected = "Invalid UTF8 sequence at string")]
#[should_panic(expected = "Invalid UTF-8 sequence")]
fn test_invalid_utf8() {
let mut converter =
RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();
Expand Down

0 comments on commit 8ba7842

Please sign in to comment.