Skip to content

Commit

Permalink
Faster StringDictionaryBuilder (apache#1851)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Jun 13, 2022
1 parent deb0fa0 commit 00e7ea8
Show file tree
Hide file tree
Showing 4 changed files with 441 additions and 303 deletions.
6 changes: 6 additions & 0 deletions arrow/Cargo.toml
Expand Up @@ -38,9 +38,11 @@ path = "src/lib.rs"
bench = false

[dependencies]
ahash = "0.7"
serde = { version = "1.0" }
serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
hashbrown = "0.12"
indexmap = { version = "1.6", features = ["std"] }
rand = { version = "0.8", optional = true }
num = "0.4"
Expand Down Expand Up @@ -194,3 +196,7 @@ required-features = ["test_utils"]
[[bench]]
name = "array_data_validate"
harness = false

[[bench]]
name = "string_dictionary_builder"
harness = false
53 changes: 53 additions & 0 deletions arrow/benches/string_dictionary_builder.rs
@@ -0,0 +1,53 @@
use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder};
use criterion::{criterion_group, criterion_main, Criterion};
use rand::{thread_rng, Rng};

/// Produces `total_size` strings for the benchmark, each cloned from a randomly
/// chosen entry of a pool of `dict_size` random strings that are `key_len`
/// characters long.
///
/// This is best effort: pool entries are not guaranteed to be unique, and not
/// every pool entry is guaranteed to appear in the returned vector.
fn build_strings(dict_size: usize, total_size: usize, key_len: usize) -> Vec<String> {
    let mut rng = thread_rng();

    // Build the pool of candidate dictionary values first.
    let mut pool: Vec<String> = Vec::with_capacity(dict_size);
    for _ in 0..dict_size {
        let mut key = String::new();
        for _ in 0..key_len {
            key.push(rng.gen::<char>());
        }
        pool.push(key);
    }

    // Then sample from the pool, with replacement, to fill the output.
    let mut sampled: Vec<String> = Vec::with_capacity(total_size);
    for _ in 0..total_size {
        let idx = rng.gen_range(0..dict_size);
        sampled.push(pool[idx].clone());
    }
    sampled
}

/// Benchmarks appending pre-generated strings to a `StringDictionaryBuilder`
/// and finishing it, across several combinations of dictionary size, total
/// number of appended values, and key length.
fn criterion_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("string_dictionary_builder");

    // Registers one benchmark in the group for the given parameters.
    let mut do_bench = |dict_size: usize, total_size: usize, key_len: usize| {
        group.bench_function(
            format!(
                "(dict_size:{}, len:{}, key_len: {})",
                dict_size, total_size, key_len
            ),
            |b| {
                // Input generation happens outside `b.iter`, so it is not timed.
                let strings = build_strings(dict_size, total_size, key_len);
                b.iter(|| {
                    let keys = Int32Builder::new(strings.len());
                    let values = StringBuilder::new((key_len + 1) * dict_size);
                    let mut builder = StringDictionaryBuilder::new(keys, values);

                    for val in &strings {
                        builder.append(val).unwrap();
                    }

                    // Return the finished array instead of discarding it: the
                    // routine's output is handed back to Criterion, so the
                    // compiler cannot treat the build as dead code.
                    builder.finish()
                })
            },
        );
    };

    do_bench(20, 1000, 5);
    do_bench(100, 1000, 5);
    do_bench(100, 1000, 10);
    do_bench(100, 10000, 10);
    do_bench(100, 10000, 100);

    group.finish();
}

// Collect `criterion_benchmark` into a benchmark group named `benches`.
criterion_group!(benches, criterion_benchmark);
// Generate the entry point that runs the `benches` group (the bench target is
// declared with `harness = false`, so it must provide its own `main`).
criterion_main!(benches);

0 comments on commit 00e7ea8

Please sign in to comment.