Skip to content

Commit

Permalink
Faster StringDictionaryBuilder (apache#1851)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Jun 24, 2022
1 parent 9e8c1bf commit 8032029
Show file tree
Hide file tree
Showing 8 changed files with 560 additions and 43 deletions.
Empty file.
2 changes: 2 additions & 0 deletions arrow/Cargo.toml
Expand Up @@ -38,13 +38,15 @@ path = "src/lib.rs"
bench = false

[dependencies]
ahash = { version = "0.7", default-features = false }
serde = { version = "1.0", default-features = false }
serde_derive = { version = "1.0", default-features = false }
serde_json = { version = "1.0", default-features = false, features = ["preserve_order"] }
indexmap = { version = "1.9", default-features = false, features = ["std"] }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
num = { version = "0.4", default-features = false, features = ["std"] }
half = { version = "2.0", default-features = false }
hashbrown = { version = "0.12", default-features = false }
csv_crate = { version = "1.1", default-features = false, optional = true, package="csv" }
regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] }
lazy_static = { version = "1.4", default-features = false }
Expand Down
70 changes: 70 additions & 0 deletions arrow/benches/string_dictionary_builder.rs
@@ -0,0 +1,70 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder};
use criterion::{criterion_group, criterion_main, Criterion};
use rand::{thread_rng, Rng};

/// Generates `total_size` strings drawn at random from a pool of `dict_size`
/// random strings, each made of `key_len` random `char`s.
///
/// This is best effort: the pool entries are not guaranteed to be distinct,
/// and not every pool entry is guaranteed to appear in the output.
fn build_strings(dict_size: usize, total_size: usize, key_len: usize) -> Vec<String> {
    let mut rng = thread_rng();

    // Build the pool of candidate dictionary values first.
    let mut pool: Vec<String> = Vec::with_capacity(dict_size);
    for _ in 0..dict_size {
        let mut key = String::new();
        for _ in 0..key_len {
            key.push(rng.gen::<char>());
        }
        pool.push(key);
    }

    // Then sample the pool uniformly (with replacement) for every output slot.
    let mut sampled: Vec<String> = Vec::with_capacity(total_size);
    for _ in 0..total_size {
        let idx = rng.gen_range(0..dict_size);
        sampled.push(pool[idx].clone());
    }
    sampled
}

/// Benchmarks building a string dictionary array with `StringDictionaryBuilder`
/// for several combinations of dictionary size, total row count and key length.
///
/// All cases are registered in the `string_dictionary_builder` benchmark group.
fn criterion_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("string_dictionary_builder");

    let mut do_bench = |dict_size: usize, total_size: usize, key_len: usize| {
        group.bench_function(
            format!(
                "(dict_size:{}, len:{}, key_len: {})",
                dict_size, total_size, key_len
            ),
            |b| {
                // Input generation happens once per benchmark, outside the
                // timed closure, so only the builder work is measured.
                let strings = build_strings(dict_size, total_size, key_len);
                b.iter(|| {
                    // NOTE(review): the constructor arguments look like capacity
                    // hints (one key per row; roughly `key_len + 1` per distinct
                    // value) -- confirm against the builder APIs.
                    let keys = Int32Builder::new(strings.len());
                    let values = StringBuilder::new((key_len + 1) * dict_size);
                    let mut builder = StringDictionaryBuilder::new(keys, values);

                    for val in &strings {
                        builder.append(val).unwrap();
                    }

                    // Return the finished array so Criterion can pass it through
                    // `black_box`; discarding it would let the optimizer elide
                    // part of the work being measured.
                    builder.finish()
                })
            },
        );
    };

    do_bench(20, 1000, 5);
    do_bench(100, 1000, 5);
    do_bench(100, 1000, 10);
    do_bench(100, 10000, 10);
    do_bench(100, 10000, 100);

    group.finish();
}

// Register `criterion_benchmark` in a Criterion group named `benches` and hand
// that group to `criterion_main!`, which provides the benchmark entry point.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

0 comments on commit 8032029

Please sign in to comment.