Remove simd and avx512 bitwise kernels in favor of autovectorization (a…

…pache#1830) * Remove simd and avx512 bitwise kernels since they are actually slightly slower than the autovectorized version * Add notes about target-cpu to README
tustvold · Jun 12, 2022 · fb697ce · fb697ce
1 parent 029203e
commit fb697ce
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 413 deletions.
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
@@ -61,7 +61,6 @@ bitflags = "1.2.1"
 
 [features]
 default = ["csv", "ipc", "test_utils"]
-avx512 = []
 csv = ["csv_crate"]
 ipc = ["flatbuffers"]
 simd = ["packed_simd"]

diff --git a/arrow/README.md b/arrow/README.md
@@ -100,3 +100,17 @@ cargo run --example read_csv
 ```
 
 [arrow]: https://arrow.apache.org/
+
+
+## Performance
+
+Most of the compute kernels benefit a lot from being optimized for a specific CPU target.
+This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions.
+Passing one of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly:
+
+ - `native`: Target the exact features of the CPU that the build is running on.
+   This should give the best performance when building and running locally, but should be used carefully, for example, when building in a CI pipeline or when shipping pre-compiled software, because the resulting binary may use instructions that the machine it eventually runs on does not support.
+ - `x86-64-v3`: Includes AVX2 support and is close to the Intel `haswell` architecture released in 2013. It should be supported by any recent Intel or AMD CPU.
+ - `x86-64-v4`: Includes AVX512 support, which is available on Intel `skylake` server processors and on `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors.
+
+These flags should be used in addition to the `simd` feature, since they also affect the code generated by the `packed_simd` library that the feature enables.
diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs
@@ -17,11 +17,14 @@
 
 #[macro_use]
 extern crate criterion;
-use criterion::Criterion;
+
+use criterion::{Criterion, Throughput};
 
 extern crate arrow;
 
-use arrow::buffer::{Buffer, MutableBuffer};
+use arrow::buffer::{
+    buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer,
+};
 
 ///  Helper function to create arrays
 fn create_buffer(size: usize) -> Buffer {
@@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) {
     criterion::black_box((left | right).unwrap());
 }
 
+fn bench_buffer_not(buffer: &Buffer) {
+    criterion::black_box(!buffer);
+}
+
+fn bench_buffer_and_with_offsets(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+    len: usize,
+) {
+    criterion::black_box(buffer_bin_and(left, left_offset, right, right_offset, len));
+}
+
+fn bench_buffer_or_with_offsets(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+    len: usize,
+) {
+    criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset, len));
+}
+
+fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) {
+    criterion::black_box(buffer_unary_not(buffer, offset, len));
+}
+
 fn bit_ops_benchmark(c: &mut Criterion) {
     let left = create_buffer(512 * 10);
     let right = create_buffer(512 * 10);
 
-    c.bench_function("buffer_bit_ops and", |b| {
-        b.iter(|| bench_buffer_and(&left, &right))
-    });
+    c.benchmark_group("buffer_binary_ops")
+        .throughput(Throughput::Bytes(3 * left.len() as u64))
+        .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right)))
+        .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right)))
+        .bench_function("and_with_offset", |b| {
+            b.iter(|| {
+                bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
+            })
+        })
+        .bench_function("or_with_offset", |b| {
+            b.iter(|| {
+                bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
+            })
+        });
 
-    c.bench_function("buffer_bit_ops or", |b| {
-        b.iter(|| bench_buffer_or(&left, &right))
-    });
+    c.benchmark_group("buffer_unary_ops")
+        .throughput(Throughput::Bytes(2 * left.len() as u64))
+        .bench_function("not", |b| b.iter(|| bench_buffer_not(&left)))
+        .bench_function("not_with_offset", |b| {
+            b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5))
+        });
 }
 
 criterion_group!(benches, bit_ops_benchmark);

diff --git a/arrow/src/arch/avx512.rs b/arrow/src/arch/avx512.rs
diff --git a/arrow/src/arch/mod.rs b/arrow/src/arch/mod.rs