diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ebcdd9e7a85..3f69888d514 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -61,7 +61,6 @@ bitflags = "1.2.1" [features] default = ["csv", "ipc", "test_utils"] -avx512 = [] csv = ["csv_crate"] ipc = ["flatbuffers"] simd = ["packed_simd"] diff --git a/arrow/README.md b/arrow/README.md index 67de57ff0f6..28240e77dff 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -100,3 +100,17 @@ cargo run --example read_csv ``` [arrow]: https://arrow.apache.org/ + + +## Performance + +Most of the compute kernels benefit a lot from being optimized for a specific CPU target. +This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions. +One of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly: + + - `native`: Target the exact features of the cpu that the build is running on. + This should give the best performance when building and running locally, but should be used carefully for example when building in a CI pipeline or when shipping pre-compiled software. + - `x86-64-v3`: Includes AVX2 support and is close to the intel `haswell` architecture released in 2013 and should be supported by any recent Intel or Amd cpu. + - `x86-64-v4`: Includes AVX512 support available on intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors. + +These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library. \ No newline at end of file diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs index 063f39c9272..6c6bb0463b2 100644 --- a/arrow/benches/buffer_bit_ops.rs +++ b/arrow/benches/buffer_bit_ops.rs @@ -17,11 +17,14 @@ #[macro_use] extern crate criterion; -use criterion::Criterion; + +use criterion::{Criterion, Throughput}; extern crate arrow; -use arrow::buffer::{Buffer, MutableBuffer}; +use arrow::buffer::{ + buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, +}; /// Helper function to create arrays fn create_buffer(size: usize) -> Buffer { @@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) { criterion::black_box((left | right).unwrap()); } +fn bench_buffer_not(buffer: &Buffer) { + criterion::black_box(!buffer); +} + +fn bench_buffer_and_with_offsets( + left: &Buffer, + left_offset: usize, + right: &Buffer, + right_offset: usize, + len: usize, +) { + criterion::black_box(buffer_bin_and(left, left_offset, right, right_offset, len)); +} + +fn bench_buffer_or_with_offsets( + left: &Buffer, + left_offset: usize, + right: &Buffer, + right_offset: usize, + len: usize, +) { + criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset, len)); +} + +fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) { + criterion::black_box(buffer_unary_not(buffer, offset, len)); +} + fn bit_ops_benchmark(c: &mut Criterion) { let left = create_buffer(512 * 10); let right = create_buffer(512 * 10); - c.bench_function("buffer_bit_ops and", |b| { - b.iter(|| bench_buffer_and(&left, &right)) - }); + c.benchmark_group("buffer_binary_ops") + .throughput(Throughput::Bytes(3 * left.len() as u64)) + .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right))) + .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right))) + .bench_function("and_with_offset", |b| { + b.iter(|| { + bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) + }) + }) + .bench_function("or_with_offset", |b| { + b.iter(|| { + bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) + }) + }); - c.bench_function("buffer_bit_ops or", |b| { - b.iter(|| bench_buffer_or(&left, &right)) - }); + c.benchmark_group("buffer_unary_ops") + .throughput(Throughput::Bytes(2 * left.len() as u64)) + .bench_function("not", |b| b.iter(|| bench_buffer_not(&left))) + .bench_function("not_with_offset", |b| { + b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5)) + }); } criterion_group!(benches, bit_ops_benchmark); diff --git a/arrow/src/arch/avx512.rs b/arrow/src/arch/avx512.rs deleted file mode 100644 index 264532f3594..00000000000 --- a/arrow/src/arch/avx512.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -pub(crate) const AVX512_U8X64_LANES: usize = 64; - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_epi64}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_and_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_or(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_loadu_epi64, _mm512_or_si512}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_or_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bitwise_and_avx512() { - let buf1 = [0b00110011u8; 64]; - let buf2 = [0b11110000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_and(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b00110000u8, i); - } - } - - #[test] - fn test_bitwise_or_avx512() { - let buf1 = [0b00010011u8; 64]; - let buf2 = [0b11100000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_or(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b11110011u8, i); - } - } -} diff --git a/arrow/src/arch/mod.rs b/arrow/src/arch/mod.rs deleted file mode 100644 index 56d8f4c0e2c..00000000000 --- a/arrow/src/arch/mod.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// -/// Arch module contains architecture specific code. -/// Be aware that not all machines have these specific operations available. -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub(crate) mod avx512; diff --git a/arrow/src/buffer/ops.rs b/arrow/src/buffer/ops.rs index e0086a1a820..b3571d1740b 100644 --- a/arrow/src/buffer/ops.rs +++ b/arrow/src/buffer/ops.rs @@ -15,110 +15,8 @@ // specific language governing permissions and limitations // under the License. -#[cfg(feature = "simd")] -use crate::util::bit_util; -#[cfg(feature = "simd")] -use packed_simd::u8x64; - -#[cfg(feature = "avx512")] -use crate::arch::avx512::*; -use crate::util::bit_util::ceil; -#[cfg(any(feature = "simd", feature = "avx512"))] -use std::borrow::BorrowMut; - use super::{Buffer, MutableBuffer}; - -/// Apply a bitwise operation `simd_op` / `scalar_op` to two inputs using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_bin_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. -#[cfg(feature = "simd")] -pub fn bitwise_bin_op_simd_helper( - left: &Buffer, - left_offset: usize, - right: &Buffer, - right_offset: usize, - len: usize, - simd_op: SI, - scalar_op: SC, -) -> Buffer -where - SI: Fn(u8x64, u8x64) -> u8x64, - SC: Fn(u8, u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut right_chunks = right.as_slice()[right_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| { - unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res, &simd_op) }; - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = scalar_op(*left, *right); - }); - - result.into() -} - -/// Apply a bitwise operation `simd_op` / `scalar_op` to one input using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_unary_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. -#[cfg(feature = "simd")] -pub fn bitwise_unary_op_simd_helper( - left: &Buffer, - left_offset: usize, - len: usize, - simd_op: SI, - scalar_op: SC, -) -> Buffer -where - SI: Fn(u8x64) -> u8x64, - SC: Fn(u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut()) - .for_each(|(res, left)| unsafe { - let data_simd = u8x64::from_slice_unaligned_unchecked(left); - let simd_result = simd_op(data_simd); - simd_result.write_to_slice_unaligned_unchecked(res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip(left_chunks.remainder().iter()) - .for_each(|(res, left)| { - *res = scalar_op(*left); - }); - - result.into() -} +use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. @@ -189,100 +87,6 @@ where result.into() } -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_and(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left & *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a & b, - |a, b| a & b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -// Note: do not target specific features like x86 without considering -// other targets like wasm32, as those would fail to build -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] pub fn buffer_bin_and( left: &Buffer, left_offset_in_bits: usize, @@ -300,98 +104,6 @@ pub fn buffer_bin_and( ) } -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_or(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left | *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a | b, - |a, b| a | b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] pub fn buffer_bin_or( left: &Buffer, left_offset_in_bits: usize, @@ -414,20 +126,5 @@ pub fn buffer_unary_not( offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - // SIMD implementation if available and byte-aligned - #[cfg(feature = "simd")] - if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 { - return bitwise_unary_op_simd_helper( - &left, - offset_in_bits / 8, - len_in_bits / 8, - |a| !a, - |a| !a, - ); - } - // Default implementation - #[allow(unreachable_code)] - { - bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) - } + bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 0cb77a36090..95c69ca0be6 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -225,14 +225,10 @@ //! [issue tracker]: https://github.com/apache/arrow-rs/issues //! -#![cfg_attr(feature = "avx512", feature(stdsimd))] -#![cfg_attr(feature = "avx512", feature(repr_simd))] -#![cfg_attr(feature = "avx512", feature(avx512_target_feature))] #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] pub mod alloc; -mod arch; pub mod array; pub mod bitmap; pub mod buffer;