provide Standard for x86 __m128/256i on stable Rust, add 128xN/sizexN SIMD types #1162

Closed · wants to merge 4 commits
2 changes: 1 addition & 1 deletion Cargo.toml
```diff
@@ -41,7 +41,7 @@ alloc = ["rand_core/alloc"]
 # Option: use getrandom package for seeding
 getrandom = ["rand_core/getrandom"]
 
-# Option (requires nightly): experimental SIMD support
+# Option (requires nightly Rust): experimental SIMD support
 simd_support = ["packed_simd"]
 
 # Option (enabled by default): enable StdRng
```
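For downstream users, this feature is enabled from their own Cargo.toml. A minimal sketch, assuming a rand release that includes this change (the version number is illustrative):

```toml
[dependencies]
rand = { version = "0.8", features = ["simd_support"] }
```

As the updated comment notes, the feature still requires a nightly Rust toolchain.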
93 changes: 49 additions & 44 deletions src/distributions/integer.rs
```diff
@@ -10,12 +10,10 @@
 use crate::distributions::{Distribution, Standard};
 use crate::Rng;
-#[cfg(all(target_arch = "x86", feature = "simd_support"))]
-use core::arch::x86::{__m128i, __m256i};
-#[cfg(all(target_arch = "x86_64", feature = "simd_support"))]
-use core::arch::x86_64::{__m128i, __m256i};
-use core::num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize,
-                NonZeroU128};
+#[cfg(target_arch = "x86")] use core::arch::x86::{__m128i, __m256i};
+#[cfg(target_arch = "x86_64")] use core::arch::x86_64::{__m128i, __m256i};
+use core::mem;
+use core::num::{NonZeroU128, NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize};
 #[cfg(feature = "simd_support")] use packed_simd::*;
 
 impl Distribution<u8> for Standard {
```
```diff
@@ -109,53 +107,60 @@ impl_nzint!(NonZeroU64, NonZeroU64::new);
 impl_nzint!(NonZeroU128, NonZeroU128::new);
 impl_nzint!(NonZeroUsize, NonZeroUsize::new);
 
-
 #[cfg(feature = "simd_support")]
-macro_rules! simd_impl {
-    ($(($intrinsic:ident, $vec:ty),)+) => {$(
-        impl Distribution<$intrinsic> for Standard {
-            #[inline]
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $intrinsic {
-                $intrinsic::from_bits(rng.gen::<$vec>())
+macro_rules! packed_simd_types_impl {
+    ($($ty:ty),+) => {
+        $(
+            impl Distribution<$ty> for Standard {
+                #[inline]
+                fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                    let mut vec: $ty = <$ty>::default();
+                    unsafe {
+                        let ptr = &mut vec;
+                        let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; mem::size_of::<$ty>()]);
+                        rng.fill_bytes(b_ptr);
+                    }
+                    vec.to_le()
```
Review thread on lines +118 to +124:

Member: I think this is correct, but we should really use `from_bits` like the old code to avoid `unsafe` (but do use `fill_bytes` instead of `gen`). Unfortunately `from_bits` is not documented on docs.rs; I just dropped a PR for that.

Collaborator (author): I'm confused by this. Do you mean use `fill_bytes` on a regular array and then `from_slice_unaligned`? That would avoid all `unsafe`.

Member: Hmm, I hadn't figured on `Simd<[u8; 2]>` etc. being hard to construct from an array. Maybe my suggestion doesn't make sense then.

Collaborator (author): We could do something like

```rust
let mut bytes = [0_u8; mem::size_of::<$ty>()];
rng.fill_bytes(&mut bytes);
let vec = $ty::from_bits($u8xN::from_slice_unaligned(&bytes));
vec.to_le()
```

but usizexN don't have `from_bits`.
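For concreteness, here is a standalone sketch of the safe construction discussed in this thread, for a lane type that does have `from_bits`. It assumes packed_simd's `FromBits` trait and `from_slice_unaligned`; it is an illustration, not part of the diff:

```rust
use packed_simd::{u8x16, u32x4, FromBits};
use rand::RngCore;

fn sample_u32x4<R: RngCore>(rng: &mut R) -> u32x4 {
    // Fill a plain byte array, then reinterpret it as a vector: no `unsafe`.
    let mut bytes = [0_u8; 16];
    rng.fill_bytes(&mut bytes);
    u32x4::from_bits(u8x16::from_slice_unaligned(&bytes)).to_le()
}
```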

```diff
-            }
-        }
-    )+};
-
-    ($bits:expr,) => {};
-    ($bits:expr, $ty:ty, $($ty_more:ty,)*) => {
-        simd_impl!($bits, $($ty_more,)*);
-
-        impl Distribution<$ty> for Standard {
-            #[inline]
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                let mut vec: $ty = Default::default();
-                unsafe {
-                    let ptr = &mut vec;
-                    let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; $bits/8]);
-                    rng.fill_bytes(b_ptr);
-                }
-                vec.to_le()
-            }
-        }
-    };
-}
+                }
+            }
+        )+
+    };
+}
 
 #[cfg(feature = "simd_support")]
-simd_impl!(16, u8x2, i8x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(32, u8x4, i8x4, u16x2, i16x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,);
-#[cfg(feature = "simd_support")]
-simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,);
-#[cfg(all(
-    feature = "simd_support",
-    any(target_arch = "x86", target_arch = "x86_64")
-))]
-simd_impl!((__m128i, u8x16), (__m256i, u8x32),);
+#[rustfmt::skip]
+packed_simd_types_impl!(
+    u8x2, i8x2,
+    u8x4, i8x4, u16x2, i16x2,
+    u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,
+    u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,
+    u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,
+    u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,
+    usizex2, usizex4, usizex8
+);
+
+// x86/64 are already little endian so we don't need packed_simd's `to_le` and
+// therefore can provide this on stable Rust.
+macro_rules! intrinsic_native_le_impl {
+    ($($ty:ty),+) => {
+        $(
+            impl Distribution<$ty> for Standard {
+                #[inline]
+                fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                    // On proper hardware, this should compile to SIMD intrinsics
+                    // Verified on x86 Haswell with __m128i, __m256i
+                    let mut buf = [0_u8; mem::size_of::<$ty>()];
+                    rng.fill_bytes(&mut buf);
+                    unsafe { mem::transmute_copy(&buf) }
+                }
+            }
+        )+
+    };
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+intrinsic_native_le_impl!(__m128i, __m256i);
 
 #[cfg(test)]
 mod tests {
```
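As a usage sketch, the point of `intrinsic_native_le_impl!` is that the following compiles on stable Rust without the `simd_support` feature (assuming a rand build containing this PR; `__m256i` works the same way):

```rust
use rand::Rng;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::__m128i;

#[cfg(target_arch = "x86_64")]
fn random_m128i<R: Rng>(rng: &mut R) -> __m128i {
    // Resolves via the new `Distribution<__m128i> for Standard` impl.
    rng.gen()
}
```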
15 changes: 15 additions & 0 deletions src/distributions/uniform.rs
```diff
@@ -712,6 +712,21 @@ uniform_simd_int_impl! {
     u8
 }
 
+#[cfg(feature = "simd_support")]
+uniform_simd_int_impl! {
+    (usizex2, isizex2),
+    (usizex4, isizex4),
+    (usizex8, isizex8),
+    usize
+}
+
+#[cfg(feature = "simd_support")]
+uniform_simd_int_impl! {
+    (u128x2, i128x2),
+    (u128x4, i128x4),
+    u128
+}
+
 impl SampleUniform for char {
     type Sampler = UniformChar;
 }
```
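A sketch of what these new impls enable (nightly plus `simd_support` assumed; the bounds are illustrative). `Uniform` samples each lane independently within its own range:

```rust
#[cfg(feature = "simd_support")]
fn sample_lanes<R: rand::Rng>(rng: &mut R) -> packed_simd::u128x2 {
    use packed_simd::u128x2;
    use rand::distributions::{Distribution, Uniform};

    // Each lane is drawn from the half-open range [0, 1000).
    let dist = Uniform::new(u128x2::splat(0), u128x2::splat(1000));
    dist.sample(rng)
}
```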
8 changes: 8 additions & 0 deletions src/distributions/utils.rs
```diff
@@ -202,6 +202,14 @@ mod simd_wmul {
     wmul_impl_large! { (u16x32,) u16, 8 }
     wmul_impl_large! { (u32x16,) u32, 16 }
     wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
+    wmul_impl_large! { (u128x2, u128x4,) u128, 64 }
+
+    #[cfg(target_pointer_width = "64")]
+    wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 32 }
+    #[cfg(target_pointer_width = "32")]
+    wmul_impl! { (usizex2, u64x2), (usizex4, u64x4), (usizex8, u64x8),, 32 }
+    #[cfg(target_pointer_width = "16")]
+    wmul_impl! { (usizex2, u32x2), (usizex4, u32x4), (usizex8, u32x8),, 16 }
 }
 
 /// Helper trait when dealing with scalar and SIMD floating point types.
```
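The `wmul` helpers return the full double-width product of each lane as (high, low) halves. A minimal scalar sketch of the idea (not the PR's code):

```rust
/// Scalar analogue of the widening multiply: the full 128-bit product
/// of two u64 values, returned as (high, low) 64-bit halves.
fn wmul_u64(a: u64, b: u64) -> (u64, u64) {
    let wide = (a as u128) * (b as u128);
    ((wide >> 64) as u64, wide as u64)
}
```

Since `u128` lanes have no wider type to promote into, `wmul_impl_large` instead splits each lane at the given half-width (the trailing `64` in the `u128` line above) and combines the partial products, which is why `u128x2`/`u128x4` go through that macro rather than a widening `wmul_impl!`.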