Merge pull request #563 from rust-ndarray/integrate-rayon
Integrate ndarray-parallel and make rayon an optional feature
bluss committed Dec 3, 2018
2 parents 1cf7963 + b677c77 commit ad714f8
Showing 14 changed files with 1,030 additions and 6 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
@@ -31,6 +31,8 @@ num-traits = "0.2"
num-complex = "0.2"
itertools = { version = "0.7.0", default-features = false }

rayon = { version = "1.0.3", optional = true }

# Use via the `blas` crate feature!
cblas-sys = { version = "0.1.4", optional = true, default-features = false }
blas-src = { version = "0.2.0", optional = true, default-features = false }
@@ -57,7 +59,7 @@ test-blas-openblas-sys = ["blas"]
test = ["test-blas-openblas-sys"]

# This feature is used for docs
docs = ["serde-1"]
docs = ["serde-1", "rayon"]

[profile.release]
[profile.bench]
5 changes: 5 additions & 0 deletions README.rst
@@ -52,6 +52,11 @@ your `Cargo.toml`.
- Optional, compatible with Rust stable
- Enables serialization support for serde 1.0

- ``rayon``

- Optional, compatible with Rust stable
- Enables parallel iterators, parallelized methods and ``par_azip!``.

- ``blas``

- Optional and experimental, compatible with Rust stable
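To illustrate what the new ``rayon`` feature enables, here is a minimal sketch (not part of this commit) of parallel element-wise mutation; it assumes a downstream crate depends on ndarray with ``features = ["rayon"]`` and uses rayon's default global thread pool:

// Hedged sketch: parallel map over every element via the feature-gated iterators.
extern crate ndarray;

use ndarray::prelude::*;
use ndarray::parallel::prelude::*;

fn main() {
    let mut a = Array2::<f64>::zeros((128, 128));
    // Each element is visited by rayon's worker threads, in arbitrary order.
    a.view_mut().into_par_iter().for_each(|x| *x = x.exp());
}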
157 changes: 157 additions & 0 deletions benches/par_rayon.rs
@@ -0,0 +1,157 @@
#![cfg(feature="rayon")]
#![feature(test)]

extern crate rayon;

extern crate ndarray;
extern crate itertools;

use ndarray::prelude::*;
use ndarray::parallel::prelude::*;

extern crate test;
use test::Bencher;

use ndarray::Zip;

const EXP_N: usize = 256;
const ADDN: usize = 512;

use std::cmp::max;

fn set_threads() {
    // Consider setting a fixed number of threads here, for example to avoid
    // oversubscribing on hyperthreaded cores.
    // let n = 4;
    // let _ = rayon::ThreadPoolBuilder::new().num_threads(n).build_global();
}

#[bench]
fn map_exp_regular(bench: &mut Bencher)
{
    let mut a = Array2::<f64>::zeros((EXP_N, EXP_N));
    a.swap_axes(0, 1);
    bench.iter(|| {
        a.mapv_inplace(|x| x.exp());
    });
}

#[bench]
fn rayon_exp_regular(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((EXP_N, EXP_N));
    a.swap_axes(0, 1);
    bench.iter(|| {
        a.view_mut().into_par_iter().for_each(|x| *x = x.exp());
    });
}

const FASTEXP: usize = EXP_N;

/// Cheap stand-in for `exp`: (1 + x/1024)^1024 approximates e^x.
#[inline]
fn fastexp(x: f64) -> f64 {
    let x = 1. + x/1024.;
    x.powi(1024)
}

#[bench]
fn map_fastexp_regular(bench: &mut Bencher)
{
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    bench.iter(|| {
        a.mapv_inplace(|x| fastexp(x))
    });
}

#[bench]
fn rayon_fastexp_regular(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    bench.iter(|| {
        a.view_mut().into_par_iter().for_each(|x| *x = fastexp(*x));
    });
}

#[bench]
fn map_fastexp_cut(bench: &mut Bencher)
{
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    let mut a = a.slice_mut(s![.., ..-1]);
    bench.iter(|| {
        a.mapv_inplace(|x| fastexp(x))
    });
}

#[bench]
fn rayon_fastexp_cut(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    let mut a = a.slice_mut(s![.., ..-1]);
    bench.iter(|| {
        a.view_mut().into_par_iter().for_each(|x| *x = fastexp(*x));
    });
}

#[bench]
fn map_fastexp_by_axis(bench: &mut Bencher)
{
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    bench.iter(|| {
        for mut sheet in a.axis_iter_mut(Axis(0)) {
            sheet.mapv_inplace(fastexp)
        }
    });
}

#[bench]
fn rayon_fastexp_by_axis(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    bench.iter(|| {
        a.axis_iter_mut(Axis(0)).into_par_iter()
            .for_each(|mut sheet| sheet.mapv_inplace(fastexp));
    });
}

#[bench]
fn rayon_fastexp_zip(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
    bench.iter(|| {
        Zip::from(&mut a).into_par_iter().for_each(|(elt, )| *elt = fastexp(*elt));
    });
}

#[bench]
fn add(bench: &mut Bencher)
{
    let mut a = Array2::<f64>::zeros((ADDN, ADDN));
    let b = Array2::<f64>::zeros((ADDN, ADDN));
    let c = Array2::<f64>::zeros((ADDN, ADDN));
    let d = Array2::<f64>::zeros((ADDN, ADDN));
    bench.iter(|| {
        azip!(mut a, b, c, d in {
            *a += b.exp() + c.exp() + d.exp();
        });
    });
}

#[bench]
fn rayon_add(bench: &mut Bencher)
{
    set_threads();
    let mut a = Array2::<f64>::zeros((ADDN, ADDN));
    let b = Array2::<f64>::zeros((ADDN, ADDN));
    let c = Array2::<f64>::zeros((ADDN, ADDN));
    let d = Array2::<f64>::zeros((ADDN, ADDN));
    bench.iter(|| {
        par_azip!(mut a, b, c, d in {
            *a += b.exp() + c.exp() + d.exp();
        });
    });
}
1 change: 0 additions & 1 deletion scripts/all-tests.sh
@@ -11,7 +11,6 @@ cargo test --verbose --no-default-features
cargo test --release --verbose --no-default-features
cargo build --verbose --features "$FEATURES"
cargo test --verbose --features "$FEATURES"
cargo test --manifest-path=parallel/Cargo.toml --verbose
cargo test --manifest-path=serialization-tests/Cargo.toml --verbose
cargo test --manifest-path=blas-tests/Cargo.toml --verbose
CARGO_TARGET_DIR=target/ cargo test --manifest-path=numeric-tests/Cargo.toml --verbose
13 changes: 10 additions & 3 deletions src/lib.rs
@@ -55,11 +55,8 @@
//! needs matching memory layout to be efficient (with some exceptions).
//! + Efficient floating point matrix multiplication even for very large
//! matrices; can optionally use BLAS to improve it further.
//! + See also the [`ndarray-parallel`] crate for integration with rayon.
//! - **Requires Rust 1.30**
//!
//! [`ndarray-parallel`]: https://docs.rs/ndarray-parallel
//!
//! ## Crate Feature Flags
//!
//! The following crate feature flags are available. They are configured in your
@@ -68,6 +65,9 @@
//! - `serde-1`
//! - Optional, compatible with Rust stable
//! - Enables serialization support for serde 1.0
//! - `rayon`
//! - Optional, compatible with Rust stable
//! - Enables parallel iterators, parallelized methods and [`par_azip!`].
//! - `blas`
//! - Optional and experimental, compatible with Rust stable
//! - Enable transparent BLAS support for matrix multiplication.
@@ -87,6 +87,9 @@
#[cfg(feature = "serde-1")]
extern crate serde;

#[cfg(feature="rayon")]
extern crate rayon;

#[cfg(feature="blas")]
extern crate cblas_sys;
#[cfg(feature="blas")]
@@ -1333,6 +1336,10 @@ impl<A, S, D> ArrayBase<S, D>
}


// parallel methods
#[cfg(feature="rayon")]
pub mod parallel;

mod impl_1d;
mod impl_2d;
mod impl_dyn;
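The `par_azip!` macro referenced in the documentation above is the parallel counterpart of `azip!`. A minimal sketch of its use, assuming the `rayon` feature is enabled and using the same macro syntax as the benchmark added by this commit:

// Hedged sketch: lock-step parallel traversal of three arrays with par_azip!.
#[macro_use]
extern crate ndarray;

use ndarray::prelude::*;

fn main() {
    let mut a = Array2::<f64>::zeros((64, 64));
    let b = Array2::<f64>::ones((64, 64));
    let c = Array2::<f64>::ones((64, 64));
    // `a` is mutated in place; the work is split across rayon's worker threads.
    par_azip!(mut a, b, c in {
        *a = b + c;
    });
    assert_eq!(a[[0, 0]], 2.0);
}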
85 changes: 85 additions & 0 deletions src/parallel/impl_par_methods.rs
@@ -0,0 +1,85 @@

use {
    Dimension,
    NdProducer,
    Zip,
    ArrayBase,
    DataMut,
};

use parallel::prelude::*;


/// # Parallel methods
///
/// These methods require crate feature `rayon`.
impl<A, S, D> ArrayBase<S, D>
    where S: DataMut<Elem=A>,
          D: Dimension,
          A: Send + Sync,
{
    /// Parallel version of `map_inplace`.
    ///
    /// Modify the array in place by calling `f` by mutable reference on each element.
    ///
    /// Elements are visited in arbitrary order.
    pub fn par_map_inplace<F>(&mut self, f: F)
        where F: Fn(&mut A) + Sync + Send
    {
        self.view_mut().into_par_iter().for_each(f)
    }

    /// Parallel version of `mapv_inplace`.
    ///
    /// Modify the array in place by calling `f` by **v**alue on each element.
    /// The array is updated with the new values.
    ///
    /// Elements are visited in arbitrary order.
    pub fn par_mapv_inplace<F>(&mut self, f: F)
        where F: Fn(A) -> A + Sync + Send,
              A: Clone,
    {
        self.view_mut().into_par_iter()
            .for_each(move |x| *x = f(x.clone()))
    }
}
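A minimal usage sketch for these two methods (hypothetical values; the `rayon` crate feature must be enabled):

// Hedged sketch: in-place parallel mutation by reference and by value.
extern crate ndarray;

use ndarray::Array2;

fn main() {
    let mut a = Array2::<f64>::from_elem((100, 100), 2.0);
    // Mutate through a mutable reference to each element, in parallel.
    a.par_map_inplace(|x| *x *= 0.5);
    // Map by value; each element is replaced by the closure's return value.
    a.par_mapv_inplace(f64::sqrt);
    assert!(a.iter().all(|&x| x == 1.0));
}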




// Zip

macro_rules! zip_impl {
    ($([$($p:ident)*],)+) => {
        $(
        #[allow(non_snake_case)]
        impl<D, $($p),*> Zip<($($p,)*), D>
            where $($p::Item : Send , )*
                  $($p : Send , )*
                  D: Dimension,
                  $($p: NdProducer<Dim=D> ,)*
        {
            /// The `par_apply` method for `Zip`.
            ///
            /// This is a shorthand for using `.into_par_iter().for_each()` on
            /// `Zip`.
            ///
            /// Requires crate feature `rayon`.
            pub fn par_apply<F>(self, function: F)
                where F: Fn($($p::Item),*) + Sync + Send
            {
                self.into_par_iter().for_each(move |($($p,)*)| function($($p),*))
            }
        }
        )+
    }
}

zip_impl!{
    [P1],
    [P1 P2],
    [P1 P2 P3],
    [P1 P2 P3 P4],
    [P1 P2 P3 P4 P5],
    [P1 P2 P3 P4 P5 P6],
}
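A sketch of `par_apply` driving a `Zip` over three producers (hypothetical example; `rayon` feature enabled):

// Hedged sketch: Zip::par_apply as a parallel lock-step apply.
extern crate ndarray;

use ndarray::prelude::*;
use ndarray::Zip;

fn main() {
    let a = Array2::<f64>::ones((64, 64));
    let b = Array2::<f64>::ones((64, 64));
    let mut sum = Array2::<f64>::zeros((64, 64));
    // All three producers are traversed in lock step, distributed over rayon's pool.
    Zip::from(&mut sum)
        .and(&a)
        .and(&b)
        .par_apply(|s, &x, &y| *s = x + y);
    assert_eq!(sum[[0, 0]], 2.0);
}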
54 changes: 54 additions & 0 deletions src/parallel/into_impls.rs
@@ -0,0 +1,54 @@
use {Array, ArcArray, Dimension, ArrayView, ArrayViewMut};

use super::prelude::IntoParallelIterator;
use super::Parallel;

/// Requires crate feature `rayon`.
impl<'a, A, D> IntoParallelIterator for &'a Array<A, D>
    where D: Dimension,
          A: Sync
{
    type Item = &'a A;
    type Iter = Parallel<ArrayView<'a, A, D>>;
    fn into_par_iter(self) -> Self::Iter {
        self.view().into_par_iter()
    }
}

// This is allowed: goes through `.view()`
/// Requires crate feature `rayon`.
impl<'a, A, D> IntoParallelIterator for &'a ArcArray<A, D>
    where D: Dimension,
          A: Sync
{
    type Item = &'a A;
    type Iter = Parallel<ArrayView<'a, A, D>>;
    fn into_par_iter(self) -> Self::Iter {
        self.view().into_par_iter()
    }
}

/// Requires crate feature `rayon`.
impl<'a, A, D> IntoParallelIterator for &'a mut Array<A, D>
    where D: Dimension,
          A: Sync + Send
{
    type Item = &'a mut A;
    type Iter = Parallel<ArrayViewMut<'a, A, D>>;
    fn into_par_iter(self) -> Self::Iter {
        self.view_mut().into_par_iter()
    }
}

// This is allowed: goes through `.view_mut()`, which is unique access
/// Requires crate feature `rayon`.
impl<'a, A, D> IntoParallelIterator for &'a mut ArcArray<A, D>
    where D: Dimension,
          A: Sync + Send + Clone,
{
    type Item = &'a mut A;
    type Iter = Parallel<ArrayViewMut<'a, A, D>>;
    fn into_par_iter(self) -> Self::Iter {
        self.view_mut().into_par_iter()
    }
}
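A minimal sketch exercising the reference impls above (hypothetical data; assumes the `rayon` feature and the parallel prelude):

// Hedged sketch: `&mut Array` yields `&mut A` items, `&Array` yields `&A` items.
extern crate ndarray;

use ndarray::Array1;
use ndarray::parallel::prelude::*;

fn main() {
    let mut v = Array1::<f64>::ones(10_000);
    // Mutable parallel traversal: scale every element.
    (&mut v).into_par_iter().for_each(|x| *x *= 2.0);
    // Read-only parallel traversal.
    (&v).into_par_iter().for_each(|x| assert_eq!(*x, 2.0));
}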
