Skip to content

Commit

Permalink
Remove simd and avx512 bitwise kernels in favor of autovectorization (a…
Browse files Browse the repository at this point in the history
…pache#1830)

* Remove simd and avx512 bitwise kernels since they are actually slightly slower than the autovectorized version

* Add notes about target-cpu to README
  • Loading branch information
jhorstmann committed Jun 12, 2022
1 parent 029203e commit fb697ce
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 413 deletions.
1 change: 0 additions & 1 deletion arrow/Cargo.toml
Expand Up @@ -61,7 +61,6 @@ bitflags = "1.2.1"

[features]
default = ["csv", "ipc", "test_utils"]
avx512 = []
csv = ["csv_crate"]
ipc = ["flatbuffers"]
simd = ["packed_simd"]
Expand Down
14 changes: 14 additions & 0 deletions arrow/README.md
Expand Up @@ -100,3 +100,17 @@ cargo run --example read_csv
```

[arrow]: https://arrow.apache.org/


## Performance

Most of the compute kernels benefit a lot from being optimized for a specific CPU target.
This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions.
Setting `-Ctarget-cpu=value` in `RUSTFLAGS` to one of the following values can therefore improve performance significantly:

- `native`: Target the exact features of the CPU that the build is running on.
This should give the best performance when building and running locally, but should be used carefully, for example when building in a CI pipeline or when shipping pre-compiled software.
- `x86-64-v3`: Includes AVX2 support, is close to the Intel `haswell` architecture released in 2013, and should be supported by any recent Intel or AMD CPU.
- `x86-64-v4`: Includes AVX512 support, which is available on Intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors.

These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library.
61 changes: 53 additions & 8 deletions arrow/benches/buffer_bit_ops.rs
Expand Up @@ -17,11 +17,14 @@

#[macro_use]
extern crate criterion;
use criterion::Criterion;

use criterion::{Criterion, Throughput};

extern crate arrow;

use arrow::buffer::{Buffer, MutableBuffer};
use arrow::buffer::{
buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer,
};

/// Helper function to create arrays
fn create_buffer(size: usize) -> Buffer {
Expand All @@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) {
criterion::black_box((left | right).unwrap());
}

/// Benchmark body: applies the `!` operator to `buffer`.
///
/// The result is passed through `criterion::black_box` so the compiler
/// does not treat the computation as dead code.
fn bench_buffer_not(buffer: &Buffer) {
    let negated = !buffer;
    criterion::black_box(negated);
}

/// Benchmark body: calls `buffer_bin_and` on `left` and `right` with the
/// given offsets and length.
///
/// NOTE(review): the offsets and `len` appear to be expressed in bits (the
/// caller passes `left.len() * 8 - 5`) — confirm against `buffer_bin_and`.
///
/// The result is passed through `criterion::black_box` so the compiler
/// does not treat the computation as dead code.
fn bench_buffer_and_with_offsets(
    left: &Buffer,
    left_offset: usize,
    right: &Buffer,
    right_offset: usize,
    len: usize,
) {
    let combined = buffer_bin_and(left, left_offset, right, right_offset, len);
    criterion::black_box(combined);
}

/// Benchmark body: calls `buffer_bin_or` on `left` and `right` with the
/// given offsets and length.
///
/// NOTE(review): the offsets and `len` appear to be expressed in bits (the
/// caller passes `left.len() * 8 - 5`) — confirm against `buffer_bin_or`.
///
/// The result is passed through `criterion::black_box` so the compiler
/// does not treat the computation as dead code.
fn bench_buffer_or_with_offsets(
    left: &Buffer,
    left_offset: usize,
    right: &Buffer,
    right_offset: usize,
    len: usize,
) {
    let combined = buffer_bin_or(left, left_offset, right, right_offset, len);
    criterion::black_box(combined);
}

/// Benchmark body: calls `buffer_unary_not` on `buffer` with the given
/// offset and length.
///
/// NOTE(review): `offset` and `len` appear to be expressed in bits (the
/// caller passes `left.len() * 8 - 5`) — confirm against `buffer_unary_not`.
///
/// The result is passed through `criterion::black_box` so the compiler
/// does not treat the computation as dead code.
fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) {
    let negated = buffer_unary_not(buffer, offset, len);
    criterion::black_box(negated);
}

/// Registers the buffer bit-operation benchmarks on the given Criterion instance.
///
/// Two buffers are created once via `create_buffer(512 * 10)` and shared by
/// every benchmark closure below.
///
/// NOTE(review): this listing was taken from a diff view. The standalone
/// `c.bench_function("buffer_bit_ops and", ...)` and
/// `c.bench_function("buffer_bit_ops or", ...)` calls look like the pre-change
/// benchmarks that the grouped versions replace — confirm against the real file.
fn bit_ops_benchmark(c: &mut Criterion) {
let left = create_buffer(512 * 10);
let right = create_buffer(512 * 10);

c.bench_function("buffer_bit_ops and", |b| {
b.iter(|| bench_buffer_and(&left, &right))
});
// Binary operations: `and` / `or`, each with and without offsets.
// Throughput is reported as 3x the buffer length in bytes — presumably two
// inputs read plus one output written; TODO confirm.
c.benchmark_group("buffer_binary_ops")
.throughput(Throughput::Bytes(3 * left.len() as u64))
.bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right)))
.bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right)))
.bench_function("and_with_offset", |b| {
b.iter(|| {
// Offsets 1 and 2 with a length of `left.len() * 8 - 5`; presumably
// bit units chosen so both ranges stay inside the buffers — TODO confirm.
bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
})
})
.bench_function("or_with_offset", |b| {
b.iter(|| {
// Same offsets and length as the `and_with_offset` case above.
bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
})
});

c.bench_function("buffer_bit_ops or", |b| {
b.iter(|| bench_buffer_or(&left, &right))
});
// Unary operation: `not`, with and without an offset.
// Throughput is reported as 2x the buffer length in bytes — presumably one
// input read plus one output written; TODO confirm.
c.benchmark_group("buffer_unary_ops")
.throughput(Throughput::Bytes(2 * left.len() as u64))
.bench_function("not", |b| b.iter(|| bench_buffer_not(&left)))
.bench_function("not_with_offset", |b| {
b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5))
});
}

criterion_group!(benches, bit_ops_benchmark);
Expand Down
73 changes: 0 additions & 73 deletions arrow/src/arch/avx512.rs

This file was deleted.

22 changes: 0 additions & 22 deletions arrow/src/arch/mod.rs

This file was deleted.

0 comments on commit fb697ce

Please sign in to comment.