diff --git a/Cargo.toml b/Cargo.toml
index 8de0064b4..81da3bd0d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,6 +31,8 @@ num-traits = "0.2"
 num-complex = "0.2"
 itertools = { version = "0.7.0", default-features = false }
 
+rayon = { version = "1.0.3", optional = true }
+
 # Use via the `blas` crate feature!
 cblas-sys = { version = "0.1.4", optional = true, default-features = false }
 blas-src = { version = "0.2.0", optional = true, default-features = false }
@@ -57,7 +59,7 @@ test-blas-openblas-sys = ["blas"]
 test = ["test-blas-openblas-sys"]
 
 # This feature is used for docs
-docs = ["serde-1"]
+docs = ["serde-1", "rayon"]
 
 [profile.release]
 [profile.bench]
diff --git a/README.rst b/README.rst
index 3e8959a9a..1fa147a62 100644
--- a/README.rst
+++ b/README.rst
@@ -52,6 +52,11 @@ your `Cargo.toml`.
   - Optional, compatible with Rust stable
   - Enables serialization support for serde 1.0
 
+- ``rayon``
+
+  - Optional, compatible with Rust stable
+  - Enables parallel iterators, parallelized methods and ``par_azip!``.
+
 - ``blas``
 
   - Optional and experimental, compatible with Rust stable
diff --git a/benches/par_rayon.rs b/benches/par_rayon.rs
new file mode 100644
index 000000000..e207a65aa
--- /dev/null
+++ b/benches/par_rayon.rs
@@ -0,0 +1,157 @@
+#![cfg(feature="rayon")]
+#![feature(test)]
+
+extern crate rayon;
+
+extern crate ndarray;
+extern crate itertools;
+
+use ndarray::prelude::*;
+use ndarray::parallel::prelude::*;
+
+extern crate test;
+use test::Bencher;
+
+use ndarray::Zip;
+
+const EXP_N: usize = 256;
+const ADDN: usize = 512;
+
+use std::cmp::max;
+
+fn set_threads() {
+    // Consider setting a fixed number of threads here, for example to avoid
+    // oversubscribing on hyperthreaded cores.
+    // let n = 4;
+    // let _ = rayon::ThreadPoolBuilder::new().num_threads(n).build_global();
+}
+
+#[bench]
+fn map_exp_regular(bench: &mut Bencher)
+{
+    let mut a = Array2::<f64>::zeros((EXP_N, EXP_N));
+    a.swap_axes(0, 1);
+    bench.iter(|| {
+        a.mapv_inplace(|x| x.exp());
+    });
+}
+
+#[bench]
+fn rayon_exp_regular(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((EXP_N, EXP_N));
+    a.swap_axes(0, 1);
+    bench.iter(|| {
+        a.view_mut().into_par_iter().for_each(|x| *x = x.exp());
+    });
+}
+
+const FASTEXP: usize = EXP_N;
+
+#[inline]
+fn fastexp(x: f64) -> f64 {
+    let x = 1. + x/1024.;
+    x.powi(1024)
+}
+
+#[bench]
+fn map_fastexp_regular(bench: &mut Bencher)
+{
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    bench.iter(|| {
+        a.mapv_inplace(|x| fastexp(x))
+    });
+}
+
+#[bench]
+fn rayon_fastexp_regular(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    bench.iter(|| {
+        a.view_mut().into_par_iter().for_each(|x| *x = fastexp(*x));
+    });
+}
+
+#[bench]
+fn map_fastexp_cut(bench: &mut Bencher)
+{
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    let mut a = a.slice_mut(s![.., ..-1]);
+    bench.iter(|| {
+        a.mapv_inplace(|x| fastexp(x))
+    });
+}
+
+#[bench]
+fn rayon_fastexp_cut(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    let mut a = a.slice_mut(s![.., ..-1]);
+    bench.iter(|| {
+        a.view_mut().into_par_iter().for_each(|x| *x = fastexp(*x));
+    });
+}
+
+#[bench]
+fn map_fastexp_by_axis(bench: &mut Bencher)
+{
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    bench.iter(|| {
+        for mut sheet in a.axis_iter_mut(Axis(0)) {
+            sheet.mapv_inplace(fastexp)
+        }
+    });
+}
+
+#[bench]
+fn rayon_fastexp_by_axis(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    bench.iter(|| {
+        a.axis_iter_mut(Axis(0)).into_par_iter()
+            .for_each(|mut sheet| sheet.mapv_inplace(fastexp));
+    });
+}
+
+#[bench]
+fn rayon_fastexp_zip(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((FASTEXP, FASTEXP));
+    bench.iter(|| {
+        Zip::from(&mut a).into_par_iter().for_each(|(elt, )| *elt = fastexp(*elt));
+    });
+}
+
+#[bench]
+fn add(bench: &mut Bencher)
+{
+    let mut a = Array2::<f64>::zeros((ADDN, ADDN));
+    let b = Array2::<f64>::zeros((ADDN, ADDN));
+    let c = Array2::<f64>::zeros((ADDN, ADDN));
+    let d = Array2::<f64>::zeros((ADDN, ADDN));
+    bench.iter(|| {
+        azip!(mut a, b, c, d in {
+            *a += b.exp() + c.exp() + d.exp();
+        });
+    });
+}
+
+#[bench]
+fn rayon_add(bench: &mut Bencher)
+{
+    set_threads();
+    let mut a = Array2::<f64>::zeros((ADDN, ADDN));
+    let b = Array2::<f64>::zeros((ADDN, ADDN));
+    let c = Array2::<f64>::zeros((ADDN, ADDN));
+    let d = Array2::<f64>::zeros((ADDN, ADDN));
+    bench.iter(|| {
+        par_azip!(mut a, b, c, d in {
+            *a += b.exp() + c.exp() + d.exp();
+        });
+    });
+}
diff --git a/scripts/all-tests.sh b/scripts/all-tests.sh
index e2e00b9d6..5ad7306ab 100755
--- a/scripts/all-tests.sh
+++ b/scripts/all-tests.sh
@@ -11,7 +11,6 @@ cargo test --verbose --no-default-features
 cargo test --release --verbose --no-default-features
 cargo build --verbose --features "$FEATURES"
 cargo test --verbose --features "$FEATURES"
-cargo test --manifest-path=parallel/Cargo.toml --verbose
 cargo test --manifest-path=serialization-tests/Cargo.toml --verbose
 cargo test --manifest-path=blas-tests/Cargo.toml --verbose
 CARGO_TARGET_DIR=target/ cargo test --manifest-path=numeric-tests/Cargo.toml --verbose
diff --git a/src/lib.rs b/src/lib.rs
index c56df3d02..1c884e0a7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -55,11 +55,8 @@
 //!     needs matching memory layout to be efficient (with some exceptions).
 //!   + Efficient floating point matrix multiplication even for very large
 //!     matrices; can optionally use BLAS to improve it further.
-//!   + See also the [`ndarray-parallel`] crate for integration with rayon.
 //! - **Requires Rust 1.30**
 //!
-//! [`ndarray-parallel`]: https://docs.rs/ndarray-parallel
-//!
 //! ## Crate Feature Flags
 //!
 //! The following crate feature flags are available. They are configured in your
@@ -68,6 +65,9 @@
 //! - `serde-1`
 //!   - Optional, compatible with Rust stable
 //!   - Enables serialization support for serde 1.0
+//! - `rayon`
+//!   - Optional, compatible with Rust stable
+//!   - Enables parallel iterators, parallelized methods and [`par_azip!`].
 //! - `blas`
 //!   - Optional and experimental, compatible with Rust stable
 //!   - Enable transparent BLAS support for matrix multiplication.
@@ -87,6 +87,9 @@
 #[cfg(feature = "serde-1")]
 extern crate serde;
 
+#[cfg(feature="rayon")]
+extern crate rayon;
+
 #[cfg(feature="blas")]
 extern crate cblas_sys;
 #[cfg(feature="blas")]
@@ -1333,6 +1336,10 @@ impl<A, S, D> ArrayBase<S, D>
 }
 
 
+// parallel methods
+#[cfg(feature="rayon")]
+pub mod parallel;
+
 mod impl_1d;
 mod impl_2d;
 mod impl_dyn;
diff --git a/src/parallel/impl_par_methods.rs b/src/parallel/impl_par_methods.rs
new file mode 100644
index 000000000..462eb8cc8
--- /dev/null
+++ b/src/parallel/impl_par_methods.rs
@@ -0,0 +1,85 @@
+
+use {
+    Dimension,
+    NdProducer,
+    Zip,
+    ArrayBase,
+    DataMut,
+};
+
+use parallel::prelude::*;
+
+
+/// # Parallel methods
+///
+/// These methods require crate feature `rayon`.
+impl<A, S, D> ArrayBase<S, D>
+    where S: DataMut<Elem=A>,
+          D: Dimension,
+          A: Send + Sync,
+{
+    /// Parallel version of `map_inplace`.
+    ///
+    /// Modify the array in place by calling `f` by mutable reference on each element.
+    ///
+    /// Elements are visited in arbitrary order.
+    pub fn par_map_inplace<F>(&mut self, f: F)
+        where F: Fn(&mut A) + Sync + Send
+    {
+        self.view_mut().into_par_iter().for_each(f)
+    }
+
+    /// Parallel version of `mapv_inplace`.
+    ///
+    /// Modify the array in place by calling `f` by **v**alue on each element.
+    /// The array is updated with the new values.
+    ///
+    /// Elements are visited in arbitrary order.
+    pub fn par_mapv_inplace<F>(&mut self, f: F)
+        where F: Fn(A) -> A + Sync + Send,
+              A: Clone,
+    {
+        self.view_mut().into_par_iter()
+            .for_each(move |x| *x = f(x.clone()))
+    }
+}
+
+
+
+
+// Zip
+
+macro_rules! zip_impl {
+    ($([$($p:ident)*],)+) => {
+        $(
+        #[allow(non_snake_case)]
+        impl<D, $($p),*> Zip<($($p,)*), D>
+            where $($p::Item : Send , )*
+                  $($p : Send , )*
+                  D: Dimension,
+                  $($p: NdProducer<Dim=D> ,)*
+        {
+            /// The `par_apply` method for `Zip`.
+            ///
+            /// This is a shorthand for using `.into_par_iter().for_each()` on
+            /// `Zip`.
+            ///
+            /// Requires crate feature `rayon`.
+            pub fn par_apply<F>(self, function: F)
+                where F: Fn($($p::Item),*) + Sync + Send
+            {
+                self.into_par_iter().for_each(move |($($p,)*)| function($($p),*))
+            }
+        }
+        )+
+    }
+}
+
+zip_impl!{
+    [P1],
+    [P1 P2],
+    [P1 P2 P3],
+    [P1 P2 P3 P4],
+    [P1 P2 P3 P4 P5],
+    [P1 P2 P3 P4 P5 P6],
+}
diff --git a/src/parallel/into_impls.rs b/src/parallel/into_impls.rs
new file mode 100644
index 000000000..4d38939fd
--- /dev/null
+++ b/src/parallel/into_impls.rs
@@ -0,0 +1,54 @@
+use {Array, ArcArray, Dimension, ArrayView, ArrayViewMut};
+
+use super::prelude::IntoParallelIterator;
+use super::Parallel;
+
+/// Requires crate feature `rayon`.
+impl<'a, A, D> IntoParallelIterator for &'a Array<A, D>
+    where D: Dimension,
+          A: Sync
+{
+    type Item = &'a A;
+    type Iter = Parallel<ArrayView<'a, A, D>>;
+    fn into_par_iter(self) -> Self::Iter {
+        self.view().into_par_iter()
+    }
+}
+
+// This is allowed: goes through `.view()`
+/// Requires crate feature `rayon`.
+impl<'a, A, D> IntoParallelIterator for &'a ArcArray<A, D>
+    where D: Dimension,
+          A: Sync
+{
+    type Item = &'a A;
+    type Iter = Parallel<ArrayView<'a, A, D>>;
+    fn into_par_iter(self) -> Self::Iter {
+        self.view().into_par_iter()
+    }
+}
+
+/// Requires crate feature `rayon`.
+impl<'a, A, D> IntoParallelIterator for &'a mut Array<A, D>
+    where D: Dimension,
+          A: Sync + Send
+{
+    type Item = &'a mut A;
+    type Iter = Parallel<ArrayViewMut<'a, A, D>>;
+    fn into_par_iter(self) -> Self::Iter {
+        self.view_mut().into_par_iter()
+    }
+}
+
+// This is allowed: goes through `.view_mut()`, which is unique access
+/// Requires crate feature `rayon`.
+impl<'a, A, D> IntoParallelIterator for &'a mut ArcArray<A, D>
+    where D: Dimension,
+          A: Sync + Send + Clone,
+{
+    type Item = &'a mut A;
+    type Iter = Parallel<ArrayViewMut<'a, A, D>>;
+    fn into_par_iter(self) -> Self::Iter {
+        self.view_mut().into_par_iter()
+    }
+}
diff --git a/src/parallel/mod.rs b/src/parallel/mod.rs
new file mode 100644
index 000000000..dfed8a636
--- /dev/null
+++ b/src/parallel/mod.rs
@@ -0,0 +1,122 @@
+//! Parallelization features for ndarray.
+//!
+//! Parallelization features are based on the crate [rayon] and its parallel
+//! iterators. Ndarray implements the parallel iterable traits for arrays
+//! and array views, for some of its iterators and for [Zip].
+//! There are also directly parallelized methods on arrays and on [Zip].
+//!
+//! This requires the crate feature `rayon` to be enabled.
+//!
+//! The following types implement parallel iterators, accessed using these
+//! methods:
+//!
+//! - [`Array`], [`ArcArray`]: `.par_iter()` and `.par_iter_mut()`
+//! - [`ArrayView`](ArrayView): `.into_par_iter()`
+//! - [`ArrayViewMut`](ArrayViewMut): `.into_par_iter()`
+//! - [`AxisIter`](iter::AxisIter), [`AxisIterMut`](iter::AxisIterMut): `.into_par_iter()`
+//! - [`Zip`] `.into_par_iter()`
+//!
+//! The following other parallelized methods exist:
+//!
+//! - [`ArrayBase::par_map_inplace()`]
+//! - [`ArrayBase::par_mapv_inplace()`]
+//! - [`Zip::par_apply()`] (all arities)
+//!
+//! Note that you can use the parallel iterator for [Zip] to access all other
+//! rayon parallel iterator methods.
+//!
+//! Only the axis iterators are indexed parallel iterators, the rest are all
+//! “unindexed”. Use ndarray’s [Zip] for lock step parallel iteration of
+//! multiple arrays or producers at a time.
+//!
+//! # Examples
+//!
+//! ## Arrays and array views
+//!
+//! Compute the exponential of each element in an array, parallelized.
+//!
+//! ```
+//! extern crate ndarray;
+//!
+//! use ndarray::Array2;
+//! use ndarray::parallel::prelude::*;
+//!
+//! fn main() {
+//!     let mut a = Array2::<f64>::zeros((128, 128));
+//!
+//!     // Parallel versions of regular array methods
+//!     a.par_map_inplace(|x| *x = x.exp());
+//!     a.par_mapv_inplace(f64::exp);
+//!
+//!     // You can also use the parallel iterator directly
+//!     a.par_iter_mut().for_each(|x| *x = x.exp());
+//! }
+//! ```
+//!
+//! ## Axis iterators
+//!
+//! Use the parallel `.axis_iter()` to compute the sum of each row.
+//!
+//! ```
+//! extern crate ndarray;
+//!
+//! use ndarray::Array;
+//! use ndarray::Axis;
+//! use ndarray::parallel::prelude::*;
+//!
+//! fn main() {
+//!     let a = Array::linspace(0., 63., 64).into_shape((4, 16)).unwrap();
+//!     let mut sums = Vec::new();
+//!     a.axis_iter(Axis(0))
+//!      .into_par_iter()
+//!      .map(|row| row.sum())
+//!      .collect_into_vec(&mut sums);
+//!
+//!     assert_eq!(sums, [120., 376., 632., 888.]);
+//! }
+//! ```
+//!
+//! ## Zip
+//!
+//! Use zip for lock step function application across several arrays
+//!
+//! ```
+//! extern crate ndarray;
+//!
+//! use ndarray::Array3;
+//! use ndarray::Zip;
+//!
+//! type Array3f64 = Array3<f64>;
+//!
+//! fn main() {
+//!     const N: usize = 128;
+//!     let a = Array3f64::from_elem((N, N, N), 1.);
+//!     let b = Array3f64::from_elem(a.dim(), 2.);
+//!     let mut c = Array3f64::zeros(a.dim());
+//!
+//!     Zip::from(&mut c)
+//!         .and(&a)
+//!         .and(&b)
+//!         .par_apply(|c, &a, &b| {
+//!             *c += a - b;
+//!         });
+//! }
+//! ```
+
+
+/// Into- traits for creating parallelized iterators and/or using [`par_azip!`]
+pub mod prelude {
+    #[doc(no_inline)]
+    pub use rayon::prelude::{ParallelIterator, IndexedParallelIterator,
+    IntoParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator};
+
+    pub use super::par_azip;
+}
+
+pub use self::par::Parallel;
+pub use par_azip;
+
+mod par;
+mod impl_par_methods;
+mod into_impls;
+mod zipmacro;
diff --git a/src/parallel/par.rs b/src/parallel/par.rs
new file mode 100644
index 000000000..c5d2da0bf
--- /dev/null
+++ b/src/parallel/par.rs
@@ -0,0 +1,278 @@
+
+use rayon::iter::ParallelIterator;
+use rayon::prelude::IntoParallelIterator;
+use rayon::iter::IndexedParallelIterator;
+use rayon::iter::plumbing::{Consumer, UnindexedConsumer};
+use rayon::iter::plumbing::bridge;
+use rayon::iter::plumbing::ProducerCallback;
+use rayon::iter::plumbing::Producer;
+use rayon::iter::plumbing::UnindexedProducer;
+use rayon::iter::plumbing::bridge_unindexed;
+use rayon::iter::plumbing::Folder;
+
+use iter::AxisIter;
+use iter::AxisIterMut;
+use {Dimension};
+use {ArrayView, ArrayViewMut};
+
+/// Parallel iterator wrapper.
+#[derive(Copy, Clone, Debug)]
+pub struct Parallel<I> {
+    iter: I,
+}
+
+/// Parallel producer wrapper.
+#[derive(Copy, Clone, Debug)]
+struct ParallelProducer<I>(I);
+
+macro_rules! par_iter_wrapper {
+    // thread_bounds are either Sync or Send + Sync
+    ($iter_name:ident, [$($thread_bounds:tt)*]) => {
+    /// Requires crate feature `rayon`.
+    impl<'a, A, D> IntoParallelIterator for $iter_name<'a, A, D>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <Self as Iterator>::Item;
+        type Iter = Parallel<Self>;
+        fn into_par_iter(self) -> Self::Iter {
+            Parallel {
+                iter: self,
+            }
+        }
+    }
+
+    impl<'a, A, D> ParallelIterator for Parallel<$iter_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <$iter_name<'a, A, D> as Iterator>::Item;
+        fn drive_unindexed<C>(self, consumer: C) -> C::Result
+            where C: UnindexedConsumer<Self::Item>
+        {
+            bridge(self, consumer)
+        }
+
+        fn opt_len(&self) -> Option<usize> {
+            Some(self.iter.len())
+        }
+    }
+
+    impl<'a, A, D> IndexedParallelIterator for Parallel<$iter_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        fn with_producer<Cb>(self, callback: Cb) -> Cb::Output
+            where Cb: ProducerCallback<Self::Item>
+        {
+            callback.callback(ParallelProducer(self.iter))
+        }
+
+        fn len(&self) -> usize {
+            ExactSizeIterator::len(&self.iter)
+        }
+
+        fn drive<C>(self, consumer: C) -> C::Result
+            where C: Consumer<Self::Item>
+        {
+            bridge(self, consumer)
+        }
+    }
+
+    impl<'a, A, D> IntoIterator for ParallelProducer<$iter_name<'a, A, D>>
+        where D: Dimension,
+    {
+        type IntoIter = $iter_name<'a, A, D>;
+        type Item = <Self::IntoIter as Iterator>::Item;
+
+        fn into_iter(self) -> Self::IntoIter {
+            self.0
+        }
+    }
+
+    // This is the real magic, I guess
+    impl<'a, A, D> Producer for ParallelProducer<$iter_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type IntoIter = $iter_name<'a, A, D>;
+        type Item = <Self::IntoIter as Iterator>::Item;
+
+        fn into_iter(self) -> Self::IntoIter {
+            self.0
+        }
+
+        fn split_at(self, i: usize) -> (Self, Self) {
+            let (a, b) = self.0.split_at(i);
+            (ParallelProducer(a), ParallelProducer(b))
+        }
+    }
+
+    }
+}
+
+
+par_iter_wrapper!(AxisIter, [Sync]);
+par_iter_wrapper!(AxisIterMut, [Send + Sync]);
+
+
+
+macro_rules! par_iter_view_wrapper {
+    // thread_bounds are either Sync or Send + Sync
+    ($view_name:ident, [$($thread_bounds:tt)*]) => {
+    /// Requires crate feature `rayon`.
+    impl<'a, A, D> IntoParallelIterator for $view_name<'a, A, D>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <Self as IntoIterator>::Item;
+        type Iter = Parallel<Self>;
+        fn into_par_iter(self) -> Self::Iter {
+            Parallel {
+                iter: self,
+            }
+        }
+    }
+
+
+    impl<'a, A, D> ParallelIterator for Parallel<$view_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <$view_name<'a, A, D> as IntoIterator>::Item;
+        fn drive_unindexed<C>(self, consumer: C) -> C::Result
+            where C: UnindexedConsumer<Self::Item>
+        {
+            bridge_unindexed(ParallelProducer(self.iter), consumer)
+        }
+
+        fn opt_len(&self) -> Option<usize> {
+            None
+        }
+    }
+
+    impl<'a, A, D> UnindexedProducer for ParallelProducer<$view_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <$view_name<'a, A, D> as IntoIterator>::Item;
+        fn split(self) -> (Self, Option<Self>) {
+            if self.0.len() <= 1 {
+                return (self, None)
+            }
+            let array = self.0;
+            let max_axis = array.max_stride_axis();
+            let mid = array.len_of(max_axis) / 2;
+            let (a, b) = array.split_at(max_axis, mid);
+            (ParallelProducer(a), Some(ParallelProducer(b)))
+        }
+
+        fn fold_with<F>(self, folder: F) -> F
+            where F: Folder<Self::Item>,
+        {
+            self.into_iter().fold(folder, move |f, elt| f.consume(elt))
+        }
+    }
+
+    impl<'a, A, D> IntoIterator for ParallelProducer<$view_name<'a, A, D>>
+        where D: Dimension,
+              A: $($thread_bounds)*,
+    {
+        type Item = <$view_name<'a, A, D> as IntoIterator>::Item;
+        type IntoIter = <$view_name<'a, A, D> as IntoIterator>::IntoIter;
+        fn into_iter(self) -> Self::IntoIter {
+            self.0.into_iter()
+        }
+    }
+
+    }
+}
+
+par_iter_view_wrapper!(ArrayView, [Sync]);
+par_iter_view_wrapper!(ArrayViewMut, [Sync + Send]);
+
+
+use {Zip, NdProducer, FoldWhile};
+
+macro_rules! zip_impl {
+    ($([$($p:ident)*],)+) => {
+        $(
+        /// Requires crate feature `rayon`.
+        #[allow(non_snake_case)]
+        impl<D, $($p),*> IntoParallelIterator for Zip<($($p,)*), D>
+            where $($p::Item : Send , )*
+                  $($p : Send , )*
+                  D: Dimension,
+                  $($p: NdProducer<Dim=D> ,)*
+        {
+            type Item = ($($p::Item ,)*);
+            type Iter = Parallel<Self>;
+            fn into_par_iter(self) -> Self::Iter {
+                Parallel {
+                    iter: self,
+                }
+            }
+        }
+
+        #[allow(non_snake_case)]
+        impl<D, $($p),*> ParallelIterator for Parallel<Zip<($($p,)*), D>>
+            where $($p::Item : Send , )*
+                  $($p : Send , )*
+                  D: Dimension,
+                  $($p: NdProducer<Dim=D> ,)*
+        {
+            type Item = ($($p::Item ,)*);
+
+            fn drive_unindexed<Cons>(self, consumer: Cons) -> Cons::Result
+                where Cons: UnindexedConsumer<Self::Item>
+            {
+                bridge_unindexed(ParallelProducer(self.iter), consumer)
+            }
+
+            fn opt_len(&self) -> Option<usize> {
+                None
+            }
+        }
+
+        #[allow(non_snake_case)]
+        impl<D, $($p),*> UnindexedProducer for ParallelProducer<Zip<($($p,)*), D>>
+            where $($p : Send , )*
+                  $($p::Item : Send , )*
+                  D: Dimension,
+                  $($p: NdProducer<Dim=D> ,)*
+        {
+            type Item = ($($p::Item ,)*);
+
+            fn split(self) -> (Self, Option<Self>) {
+                if self.0.size() <= 1 {
+                    return (self, None)
+                }
+                let (a, b) = self.0.split();
+                (ParallelProducer(a), Some(ParallelProducer(b)))
+            }
+
+            fn fold_with<Fold>(self, folder: Fold) -> Fold
+                where Fold: Folder<Self::Item>,
+            {
+                self.0.fold_while(folder, |mut folder, $($p),*| {
+                    folder = folder.consume(($($p ,)*));
+                    if folder.full() {
+                        FoldWhile::Done(folder)
+                    } else {
+                        FoldWhile::Continue(folder)
+                    }
+                }).into_inner()
+            }
+        }
+        )+
+    }
+}
+
+zip_impl!{
+    [P1],
+    [P1 P2],
+    [P1 P2 P3],
+    [P1 P2 P3 P4],
+    [P1 P2 P3 P4 P5],
+    [P1 P2 P3 P4 P5 P6],
+}
diff --git a/src/parallel/zipmacro.rs b/src/parallel/zipmacro.rs
new file mode 100644
index 000000000..a61ae88d4
--- /dev/null
+++ b/src/parallel/zipmacro.rs
@@ -0,0 +1,106 @@
+// Copyright 2017 bluss and ndarray developers.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_export]
+/// Parallelized array zip macro: lock step function application across several
+/// arrays and producers.
+///
+/// This is a version of the [`azip`] macro that requires the crate feature
+/// `rayon` to be enabled.
+///
+/// This example:
+///
+/// ```rust,ignore
+/// par_azip!(mut a, b, c in { *a = b + c })
+/// ```
+///
+/// Is equivalent to:
+///
+/// ```rust,ignore
+/// Zip::from(&mut a).and(&b).and(&c).par_apply(|a, &b, &c| {
+///     *a = b + c;
+/// });
+/// ```
+///
+/// **Panics** if any of the arrays are not of the same shape.
+///
+/// ## Examples
+///
+/// ```rust
+/// extern crate ndarray;
+///
+/// use ndarray::Array2;
+/// use ndarray::parallel::par_azip;
+///
+/// type M = Array2<f32>;
+///
+/// fn main() {
+///     let mut a = M::zeros((16, 16));
+///     let b = M::from_elem(a.dim(), 1.);
+///     let c = M::from_elem(a.dim(), 2.);
+///
+///     // Compute a simple ternary operation:
+///     // elementwise addition of b and c, stored in a
+///
+///     par_azip!(mut a, b, c in { *a = b + c });
+///
+///     assert_eq!(a, &b + &c);
+/// }
+/// ```
+macro_rules! par_azip {
+    // Build Zip Rule (index)
+    (@parse [index => $a:expr, $($aa:expr,)*] $t1:tt in $t2:tt) => {
+        $crate::par_azip!(@finish ($crate::Zip::indexed($a)) [$($aa,)*] $t1 in $t2)
+    };
+    // Build Zip Rule (no index)
+    (@parse [$a:expr, $($aa:expr,)*] $t1:tt in $t2:tt) => {
+        $crate::par_azip!(@finish ($crate::Zip::from($a)) [$($aa,)*] $t1 in $t2)
+    };
+    // Build Finish Rule (both)
+    (@finish ($z:expr) [$($aa:expr,)*] [$($p:pat,)+] in { $($t:tt)*}) => {
+        use $crate::parallel::prelude::*;
+        #[allow(unused_mut)]
+        ($z)
+            $(
+                .and($aa)
+            )*
+            .par_apply(|$($p),+| {
+                $($t)*
+            })
+    };
+    // parsing stack: [expressions] [patterns] (one per operand)
+    // index uses empty [] -- must be first
+    (@parse [] [] index $i:pat, $($t:tt)*) => {
+        $crate::par_azip!(@parse [index =>] [$i,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] mut $x:ident ($e:expr) $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* $e,] [$($pats)* mut $x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] mut $x:ident $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* &mut $x,] [$($pats)* mut $x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] , $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)*] [$($pats)*] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] ref $x:ident ($e:expr) $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* $e,] [$($pats)* $x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] ref $x:ident $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* &$x,] [$($pats)* $x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] $x:ident ($e:expr) $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* $e,] [$($pats)* &$x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] $x:ident $($t:tt)*) => {
+        $crate::par_azip!(@parse [$($exprs)* &$x,] [$($pats)* &$x,] $($t)*);
+    };
+    (@parse [$($exprs:tt)*] [$($pats:tt)*] $($t:tt)*) => { };
+    ($($t:tt)*) => {
+        $crate::par_azip!(@parse [] [] $($t)*);
+    }
+}
diff --git a/src/zip/mod.rs b/src/zip/mod.rs
index 366a85c0c..877c97a20 100644
--- a/src/zip/mod.rs
+++ b/src/zip/mod.rs
@@ -708,7 +708,10 @@ macro_rules! map_impl {
     ($([$notlast:ident $($p:ident)*],)+) => {
         $(
         #[allow(non_snake_case)]
-        impl<D: Dimension, $($p: NdProducer<Dim=D>),*> Zip<($($p,)*), D> {
+        impl<D, $($p),*> Zip<($($p,)*), D>
+            where D: Dimension,
+                  $($p: NdProducer<Dim=D> ,)*
+        {
             /// Apply a function to all elements of the input arrays,
             /// visiting elements in lock step.
             pub fn apply<F>(mut self, mut function: F)
diff --git a/tests/par_azip.rs b/tests/par_azip.rs
new file mode 100644
index 000000000..93caee62e
--- /dev/null
+++ b/tests/par_azip.rs
@@ -0,0 +1,68 @@
+#![cfg(feature="rayon")]
+
+#[macro_use]
+extern crate ndarray;
+extern crate itertools;
+
+use ndarray::prelude::*;
+use ndarray::parallel::prelude::*;
+use itertools::{enumerate};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+#[test]
+fn test_par_azip1() {
+    let mut a = Array::zeros(62);
+    let b = Array::from_elem(62, 42);
+    par_azip!(mut a in { *a = 42 });
+    assert_eq!(a, b);
+}
+
+#[test]
+fn test_par_azip2() {
+    let mut a = Array::zeros((5, 7));
+    let b = Array::from_shape_fn(a.dim(), |(i, j)| 1. / (i + 2*j) as f32);
+    par_azip!(mut a, b in { *a = b; });
+    assert_eq!(a, b);
+}
+
+#[test]
+fn test_par_azip3() {
+    let mut a = [0.; 32];
+    let mut b = [0.; 32];
+    let mut c = [0.; 32];
+    for (i, elt) in enumerate(&mut b) {
+        *elt = i as f32;
+    }
+
+    par_azip!(mut a (&mut a[..]), b (&b[..]), mut c (&mut c[..]) in {
+        *a += b / 10.;
+        *c = a.sin();
+    });
+    let res = Array::linspace(0., 3.1, 32).mapv_into(f32::sin);
+    assert!(res.all_close(&ArrayView::from(&c), 1e-4));
+}
+
+#[should_panic]
+#[test]
+fn test_zip_dim_mismatch_1() {
+    let mut a = Array::zeros((5, 7));
+    let mut d = a.raw_dim();
+    d[0] += 1;
+    let b = Array::from_shape_fn(d, |(i, j)| 1. / (i + 2*j) as f32);
+    par_azip!(mut a, b in { *a = b; });
+}
+
+#[test]
+fn test_indices_1() {
+    let mut a1 = Array::default(12);
+    for (i, elt) in a1.indexed_iter_mut() {
+        *elt = i;
+    }
+
+    let count = AtomicUsize::new(0);
+    par_azip!(index i, elt (&a1) in {
+        count.fetch_add(1, Ordering::SeqCst);
+        assert_eq!(elt, i);
+    });
+    assert_eq!(count.load(Ordering::SeqCst), a1.len());
+}
diff --git a/tests/par_rayon.rs b/tests/par_rayon.rs
new file mode 100644
index 000000000..3f457c46f
--- /dev/null
+++ b/tests/par_rayon.rs
@@ -0,0 +1,55 @@
+#![cfg(feature="rayon")]
+
+extern crate rayon;
+
+#[macro_use]
+extern crate ndarray;
+extern crate itertools;
+
+use ndarray::prelude::*;
+use ndarray::parallel::prelude::*;
+
+const M: usize = 1024 * 10;
+const N: usize = 100;
+
+#[test]
+fn test_axis_iter() {
+    let mut a = Array2::<f64>::zeros((M, N));
+    for (i, mut v) in a.axis_iter_mut(Axis(0)).enumerate() {
+        v.fill(i as _);
+    }
+    assert_eq!(a.axis_iter(Axis(0)).len(), M);
+    let s: f64 = a.axis_iter(Axis(0)).into_par_iter().map(|x| x.sum()).sum();
+    println!("{:?}", a.slice(s![..10, ..5]));
+    assert_eq!(s, a.sum());
+}
+
+#[test]
+fn test_axis_iter_mut() {
+    let mut a = Array::linspace(0., 1.0f64, M * N).into_shape((M, N)).unwrap();
+    let b = a.mapv(|x| x.exp());
+    a.axis_iter_mut(Axis(0)).into_par_iter().for_each(|mut v| v.mapv_inplace(|x| x.exp()));
+    println!("{:?}", a.slice(s![..10, ..5]));
+    assert!(a.all_close(&b, 0.001));
+}
+
+#[test]
+fn test_regular_iter() {
+    let mut a = Array2::<f64>::zeros((M, N));
+    for (i, mut v) in a.axis_iter_mut(Axis(0)).enumerate() {
+        v.fill(i as _);
+    }
+    let s: f64 = a.view().into_par_iter().map(|&x| x).sum();
+    println!("{:?}", a.slice(s![..10, ..5]));
+    assert_eq!(s, a.sum());
+}
+
+#[test]
+fn test_regular_iter_collect() {
+    let mut a = Array2::<f64>::zeros((M, N));
+    for (i, mut v) in a.axis_iter_mut(Axis(0)).enumerate() {
+        v.fill(i as _);
+    }
+    let v = a.view().into_par_iter().map(|&x| x).collect::<Vec<_>>();
+    assert_eq!(v.len(), a.len());
+}
diff --git a/tests/par_zip.rs b/tests/par_zip.rs
new file mode 100644
index 000000000..9cba9888c
--- /dev/null
+++ b/tests/par_zip.rs
@@ -0,0 +1,83 @@
+#![cfg(feature="rayon")]
+
+extern crate ndarray;
+extern crate itertools;
+
+use ndarray::prelude::*;
+
+use ndarray::Zip;
+
+const M: usize = 1024 * 10;
+const N: usize = 100;
+
+#[test]
+fn test_zip_1() {
+    let mut a = Array2::<f64>::zeros((M, N));
+
+    Zip::from(&mut a)
+        .par_apply(|x| {
+            *x = x.exp()
+        });
+}
+
+#[test]
+fn test_zip_index_1() {
+    let mut a = Array2::default((10, 10));
+
+    Zip::indexed(&mut a)
+        .par_apply(|i, x| {
+            *x = i;
+        });
+
+    for (i, elt) in a.indexed_iter() {
+        assert_eq!(*elt, i);
+    }
+}
+
+#[test]
+fn test_zip_index_2() {
+    let mut a = Array2::default((M, N));
+
+    Zip::indexed(&mut a)
+        .par_apply(|i, x| {
+            *x = i;
+        });
+
+    for (i, elt) in a.indexed_iter() {
+        assert_eq!(*elt, i);
+    }
+}
+
+#[test]
+fn test_zip_index_3() {
+    let mut a = Array::default((1, 2, 1, 2, 3));
+
+    Zip::indexed(&mut a)
+        .par_apply(|i, x| {
+            *x = i;
+        });
+
+    for (i, elt) in a.indexed_iter() {
+        assert_eq!(*elt, i);
+    }
+}
+
+#[test]
+fn test_zip_index_4() {
+    let mut a = Array2::zeros((M, N));
+    let mut b = Array2::zeros((M, N));
+
+    Zip::indexed(&mut a)
+        .and(&mut b)
+        .par_apply(|(i, j), x, y| {
+            *x = i;
+            *y = j;
+        });
+
+    for ((i, _), elt) in a.indexed_iter() {
+        assert_eq!(*elt, i);
+    }
+    for ((_, j), elt) in b.indexed_iter() {
+        assert_eq!(*elt, j);
+    }
+}