diff --git a/benches/distributions.rs b/benches/distributions.rs
index 4e215e857fa..eeab8ca9db7 100644
--- a/benches/distributions.rs
+++ b/benches/distributions.rs
@@ -150,6 +150,33 @@ gen_range_int!(gen_range_i64, i64, 3i64, 123_456_789_123);
 #[cfg(feature = "i128_support")]
 gen_range_int!(gen_range_i128, i128, -12345678901234i128, 123_456_789_123_456_789);
 
+// construct and sample from a floating-point range
+macro_rules! gen_range_float {
+    ($fnn:ident, $ty:ident, $low:expr, $high:expr) => {
+        #[bench]
+        fn $fnn(b: &mut Bencher) {
+            let mut rng = XorShiftRng::from_entropy();
+
+            b.iter(|| {
+                let mut high = $high;
+                let mut low = $low;
+                let mut accum: $ty = 0.0;
+                for _ in 0..::RAND_BENCH_N {
+                    accum += rng.gen_range(low, high);
+                    // force recalculation of range each time
+                    low += 0.9;
+                    high += 1.1;
+                }
+                accum
+            });
+            b.bytes = size_of::<$ty>() as u64 * ::RAND_BENCH_N;
+        }
+    }
+}
+
+gen_range_float!(gen_range_f32, f32, -20000.0f32, 100000.0);
+gen_range_float!(gen_range_f64, f64, 123.456f64, 7890.12);
+
 #[bench]
 fn dist_iter(b: &mut Bencher) {
     let mut rng = XorShiftRng::from_entropy();
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 9db0f59600e..b57fa9eaf6a 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -116,7 +116,12 @@ use std::time::Duration;
 use Rng;
 use distributions::Distribution;
 use distributions::float::IntoFloat;
-use distributions::utils::{WideningMultiply, CompareAll};
+use distributions::utils::{WideningMultiply, CompareAll, FloatAsSIMD, BoolAsSIMD};
+
+#[cfg(not(feature = "std"))]
+#[allow(unused_imports)] // rustc doesn't detect that this is actually used
+use distributions::utils::Float;
+
 #[cfg(feature="simd_support")]
 use core::simd::*;
 
@@ -139,10 +144,9 @@ use core::simd::*;
 /// generated by the RNG than the low bits, since with some RNGs the low-bits
 /// are of lower quality than the high bits.
 ///
-/// Implementations should attempt to sample in `[low, high)` for
-/// `Uniform::new(low, high)`, i.e., excluding `high`, but this may be very
-/// difficult. All the primitive integer types satisfy this property, and the
-/// float types normally satisfy it, but rounding may mean `high` can occur.
+/// Implementations must sample in `[low, high)` range for
+/// `Uniform::new(low, high)`, i.e., excluding `high`. In particular care must
+/// be taken to ensure that rounding never results values `< low` or `>= high`.
 ///
 /// # Example
 ///
@@ -284,9 +288,11 @@ pub trait SampleBorrow<Borrowed> {
     fn borrow(&self) -> &Borrowed;
 }
 impl<Borrowed> SampleBorrow<Borrowed> for Borrowed where Borrowed: SampleUniform {
+    #[inline(always)]
     fn borrow(&self) -> &Borrowed { self }
 }
 impl<'a, Borrowed> SampleBorrow<Borrowed> for &'a Borrowed where Borrowed: SampleUniform {
+    #[inline(always)]
     fn borrow(&self) -> &Borrowed { *self }
 }
 
@@ -487,10 +493,6 @@ uniform_int_impl! { u128, u128, u128, i128, u128 }
 /// multiply and addition. Values produced this way have what equals 22 bits of
 /// random digits for an `f32`, and 52 for an `f64`.
 ///
-/// Currently there is no difference between [`new`] and [`new_inclusive`],
-/// because the boundaries of a floats range are a bit of a fuzzy concept due to
-/// rounding errors.
-///
 /// [`UniformSampler`]: trait.UniformSampler.html
 /// [`new`]: trait.UniformSampler.html#tymethod.new
 /// [`new_inclusive`]: trait.UniformSampler.html#tymethod.new_inclusive
@@ -498,12 +500,12 @@ uniform_int_impl! { u128, u128, u128, i128, u128 }
 /// [`Standard`]: ../struct.Standard.html
 #[derive(Clone, Copy, Debug)]
 pub struct UniformFloat<X> {
+    low: X,
     scale: X,
-    offset: X,
 }
 
 macro_rules! uniform_float_impl {
-    ($ty:ty, $uty:ident, $bits_to_discard:expr) => {
+    ($ty:ty, $uty:ident, $f_scalar:ident, $u_scalar:ident, $bits_to_discard:expr) => {
         impl SampleUniform for $ty {
             type Sampler = UniformFloat<$ty>;
         }
@@ -519,12 +521,24 @@ macro_rules! uniform_float_impl {
                 let high = *high_b.borrow();
                 assert!(low.all_lt(high), "Uniform::new called with `low >= high`");
-                let scale = high - low;
-                let offset = low - scale;
-                UniformFloat {
-                    scale: scale,
-                    offset: offset,
+                assert!(low.all_finite() && high.all_finite(),
+                        "Uniform::new called with non-finite boundaries");
+                let max_rand = <$ty>::splat((::core::$u_scalar::MAX >> $bits_to_discard)
+                                            .into_float_with_exponent(0) - 1.0);
+
+                let mut scale = high - low;
+
+                loop {
+                    let mask = (scale * max_rand + low).ge_mask(high);
+                    if mask.none() {
+                        break;
+                    }
+                    scale = scale.decrease_masked(mask);
                 }
+
+                debug_assert!(<$ty>::splat(0.0).all_le(scale));
+
+                UniformFloat { low, scale }
             }
 
             fn new_inclusive<B1, B2>(low_b: B1, high_b: B2) -> Self
@@ -535,26 +549,44 @@ macro_rules! uniform_float_impl {
                 let high = *high_b.borrow();
                 assert!(low.all_le(high),
                         "Uniform::new_inclusive called with `low > high`");
-                let scale = high - low;
-                let offset = low - scale;
-                UniformFloat {
-                    scale: scale,
-                    offset: offset,
+                assert!(low.all_finite() && high.all_finite(),
+                        "Uniform::new_inclusive called with non-finite boundaries");
+                let max_rand = <$ty>::splat((::core::$u_scalar::MAX >> $bits_to_discard)
+                                            .into_float_with_exponent(0) - 1.0);
+
+                let mut scale = (high - low) / max_rand;
+
+                loop {
+                    let mask = (scale * max_rand + low).gt_mask(high);
+                    if mask.none() {
+                        break;
+                    }
+                    scale = scale.decrease_masked(mask);
                 }
+
+                debug_assert!(<$ty>::splat(0.0).all_le(scale));
+
+                UniformFloat { low, scale }
             }
 
             fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
                 // Generate a value in the range [1, 2)
-                let value: $uty = rng.gen::<$uty>() >> $bits_to_discard;
-                let value1_2 = value.into_float_with_exponent(0);
+                let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u8)
+                               .into_float_with_exponent(0);
+
+                // Get a value in the range [0, 1) in order to avoid
+                // overflowing into infinity when multiplying with scale
+                let value0_1 = value1_2 - 1.0;
+
                 // We don't use `f64::mul_add`, because it is not available with
                 // `no_std`. Furthermore, it is slower for some targets (but
                 // faster for others). However, the order of multiplication and
                 // addition is important, because on some platforms (e.g. ARM)
                 // it will be optimized to a single (non-FMA) instruction.
-                value1_2 * self.scale + self.offset
+                value0_1 * self.scale + self.low
             }
 
+            #[inline]
             fn sample_single<R: Rng + ?Sized, B1, B2>(low_b: B1, high_b: B2, rng: &mut R) -> Self::X
                 where B1: SampleBorrow<Self::X> + Sized,
@@ -564,37 +596,83 @@ macro_rules! uniform_float_impl {
                 let high = *high_b.borrow();
                 assert!(low.all_lt(high), "Uniform::sample_single called with low >= high");
-                let scale = high - low;
-                let offset = low - scale;
-                // Generate a value in the range [1, 2)
-                let value: $uty = rng.gen::<$uty>() >> $bits_to_discard;
-                let value1_2 = value.into_float_with_exponent(0);
-                // Doing multiply before addition allows some architectures to
-                // use a single instruction.
-                value1_2 * scale + offset
+                let mut scale = high - low;
+
+                loop {
+                    // Generate a value in the range [1, 2)
+                    let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard as u32)
+                                   .into_float_with_exponent(0);
+
+                    // Get a value in the range [0, 1) in order to avoid
+                    // overflowing into infinity when multiplying with scale
+                    let value0_1 = value1_2 - 1.0;
+
+                    // Doing multiply before addition allows some architectures
+                    // to use a single instruction.
+                    let res = value0_1 * scale + low;
+
+                    debug_assert!(low.all_le(res) || !scale.all_finite());
+                    if res.all_lt(high) {
+                        return res;
+                    }
+
+                    // This handles a number of edge cases.
+                    // * `low` or `high` is NaN. In this case `scale` and
+                    //   `res` are going to end up as NaN.
+                    // * `low` is negative infinity and `high` is finite.
+                    //   `scale` is going to be infinite and `res` will be
+                    //   NaN.
+                    // * `high` is positive infinity and `low` is finite.
+                    //   `scale` is going to be infinite and `res` will
+                    //   be infinite or NaN (if value0_1 is 0).
+                    // * `low` is negative infinity and `high` is positive
+                    //   infinity. `scale` will be infinite and `res` will
+                    //   be NaN.
+                    // * `low` and `high` are finite, but `high - low`
+                    //   overflows to infinite. `scale` will be infinite
+                    //   and `res` will be infinite or NaN (if value0_1 is 0).
+                    // So if `high` or `low` are non-finite, we are guaranteed
+                    // to fail the `res < high` check above and end up here.
+                    //
+                    // While we technically should check for non-finite `low`
+                    // and `high` before entering the loop, by doing the checks
+                    // here instead, we allow the common case to avoid these
+                    // checks. But we are still guaranteed that if `low` or
+                    // `high` are non-finite we'll end up here and can do the
+                    // appropriate checks.
+                    //
+                    // Likewise `high - low` overflowing to infinity is also
+                    // rare, so handle it here after the common case.
+                    let mask = !scale.finite_mask();
+                    if mask.any() {
+                        assert!(low.all_finite() && high.all_finite(),
+                                "Uniform::sample_single called with non-finite boundaries");
+                        scale = scale.decrease_masked(mask);
+                    }
+                }
             }
         }
     }
 }
 
-uniform_float_impl! { f32, u32, 32 - 23 }
-uniform_float_impl! { f64, u64, 64 - 52 }
+uniform_float_impl! { f32, u32, f32, u32, 32 - 23 }
+uniform_float_impl! { f64, u64, f64, u64, 64 - 52 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f32x2, u32x2, 32 - 23 }
+uniform_float_impl! { f32x2, u32x2, f32, u32, 32 - 23 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f32x4, u32x4, 32 - 23 }
+uniform_float_impl! { f32x4, u32x4, f32, u32, 32 - 23 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f32x8, u32x8, 32 - 23 }
+uniform_float_impl! { f32x8, u32x8, f32, u32, 32 - 23 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f32x16, u32x16, 32 - 23 }
+uniform_float_impl! { f32x16, u32x16, f32, u32, 32 - 23 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f64x2, u64x2, 64 - 52 }
+uniform_float_impl! { f64x2, u64x2, f64, u64, 64 - 52 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f64x4, u64x4, 64 - 52 }
+uniform_float_impl! { f64x4, u64x4, f64, u64, 64 - 52 }
 #[cfg(feature="simd_support")]
-uniform_float_impl! { f64x8, u64x8, 64 - 52 }
+uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 }
 
@@ -704,7 +782,10 @@ impl UniformSampler for UniformDuration {
 #[cfg(test)]
 mod tests {
     use Rng;
+    use rngs::mock::StepRng;
     use distributions::uniform::Uniform;
+    use distributions::utils::FloatAsSIMD;
+    #[cfg(feature="simd_support")] use core::simd::*;
 
     #[should_panic]
     #[test]
@@ -712,12 +793,6 @@ mod tests {
         Uniform::new(10, 10);
     }
 
-    #[should_panic]
-    #[test]
-    fn test_uniform_bad_limits_equal_float() {
-        Uniform::new(10., 10.);
-    }
-
     #[test]
     fn test_uniform_good_limits_equal_int() {
         let mut rng = ::test::rng(804);
@@ -727,27 +802,12 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_uniform_good_limits_equal_float() {
-        let mut rng = ::test::rng(805);
-        let dist = Uniform::new_inclusive(10., 10.);
-        for _ in 0..20 {
-            assert_eq!(rng.sample(dist), 10.);
-        }
-    }
-
     #[should_panic]
    #[test]
     fn test_uniform_bad_limits_flipped_int() {
         Uniform::new(10, 5);
     }
 
-    #[should_panic]
-    #[test]
-    fn test_uniform_bad_limits_flipped_float() {
-        Uniform::new(10., 5.);
-    }
-
     #[test]
     fn test_integers() {
         let mut rng = ::test::rng(251);
@@ -799,27 +859,135 @@ mod tests {
     #[test]
     fn test_floats() {
         let mut rng = ::test::rng(252);
+        let mut zero_rng = StepRng::new(0, 0);
+        let mut max_rng = StepRng::new(0xffff_ffff_ffff_ffff, 0);
         macro_rules! t {
-            ($($ty:ty),*) => {{
-                $(
-                    let v: &[($ty, $ty)] = &[(0.0, 100.0),
-                                             (-1e35, -1e25),
-                                             (1e-35, 1e-25),
-                                             (-1e35, 1e35)];
-                    for &(low, high) in v.iter() {
+            ($ty:ty, $f_scalar:ident, $bits_shifted:expr) => {{
+                let v: &[($f_scalar, $f_scalar)] =
+                    &[(0.0, 100.0),
+                      (-1e35, -1e25),
+                      (1e-35, 1e-25),
+                      (-1e35, 1e35),
+                      (<$f_scalar>::from_bits(0), <$f_scalar>::from_bits(3)),
+                      (-<$f_scalar>::from_bits(10), -<$f_scalar>::from_bits(1)),
+                      (-<$f_scalar>::from_bits(5), 0.0),
+                      (-<$f_scalar>::from_bits(7), -0.0),
+                      (10.0, ::core::$f_scalar::MAX),
+                      (-100.0, ::core::$f_scalar::MAX),
+                      (-::core::$f_scalar::MAX / 5.0, ::core::$f_scalar::MAX),
+                      (-::core::$f_scalar::MAX, ::core::$f_scalar::MAX / 5.0),
+                      (-::core::$f_scalar::MAX * 0.8, ::core::$f_scalar::MAX * 0.7),
+                      (-::core::$f_scalar::MAX, ::core::$f_scalar::MAX),
+                     ];
+                for &(low_scalar, high_scalar) in v.iter() {
+                    for lane in 0..<$ty>::lanes() {
+                        let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar);
+                        let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar);
                         let my_uniform = Uniform::new(low, high);
-                        for _ in 0..1000 {
-                            let v: $ty = rng.sample(my_uniform);
-                            assert!(low <= v && v < high);
+                        let my_incl_uniform = Uniform::new_inclusive(low, high);
+                        for _ in 0..100 {
+                            let v = rng.sample(my_uniform).extract(lane);
+                            assert!(low_scalar <= v && v < high_scalar);
+                            let v = rng.sample(my_incl_uniform).extract(lane);
+                            assert!(low_scalar <= v && v <= high_scalar);
+                            let v = rng.gen_range(low, high).extract(lane);
+                            assert!(low_scalar <= v && v < high_scalar);
+                        }
+
+                        assert_eq!(rng.sample(Uniform::new_inclusive(low, low)).extract(lane), low_scalar);
+
+                        assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar);
+                        assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar);
+                        assert_eq!(zero_rng.gen_range(low, high).extract(lane), low_scalar);
+                        assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar);
+                        assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar);
+
+                        // Don't run this test for really tiny differences between high and low
+                        // since for those rounding might result in selecting high for a very
+                        // long time.
+                        if (high_scalar - low_scalar) > 0.0001 {
+                            let mut lowering_max_rng =
+                                StepRng::new(0xffff_ffff_ffff_ffff,
+                                             (-1i64 << $bits_shifted) as u64);
+                            assert!(lowering_max_rng.gen_range(low, high).extract(lane) < high_scalar);
+                        }
                     }
                 }
-                )*
+
+                assert_eq!(rng.sample(Uniform::new_inclusive(::core::$f_scalar::MAX,
+                                                             ::core::$f_scalar::MAX)),
+                           ::core::$f_scalar::MAX);
+                assert_eq!(rng.sample(Uniform::new_inclusive(-::core::$f_scalar::MAX,
+                                                             -::core::$f_scalar::MAX)),
+                           -::core::$f_scalar::MAX);
             }}
         }
 
-        t!(f32, f64)
+        t!(f32, f32, 32 - 23);
+        t!(f64, f64, 64 - 52);
+        #[cfg(feature="simd_support")] t!(f32x2, f32, 32 - 23);
+        #[cfg(feature="simd_support")] t!(f32x4, f32, 32 - 23);
+        #[cfg(feature="simd_support")] t!(f32x8, f32, 32 - 23);
+        #[cfg(feature="simd_support")] t!(f32x16, f32, 32 - 23);
+        #[cfg(feature="simd_support")] t!(f64x2, f64, 64 - 52);
+        #[cfg(feature="simd_support")] t!(f64x4, f64, 64 - 52);
+        #[cfg(feature="simd_support")] t!(f64x8, f64, 64 - 52);
     }
 
+    #[test]
+    #[cfg(all(feature="std",
+              not(target_arch = "wasm32"),
+              not(target_arch = "asmjs")))]
+    fn test_float_assertions() {
+        use core::panic::catch_unwind;
+        use super::SampleUniform;
+        fn range<T: SampleUniform>(low: T, high: T) {
+            let mut rng = ::test::rng(253);
+            rng.gen_range(low, high);
+        }
+
+        macro_rules! t {
+            ($ty:ident, $f_scalar:ident) => {{
+                let v: &[($f_scalar, $f_scalar)] =
+                    &[(::std::$f_scalar::NAN, 0.0),
+                      (1.0, ::std::$f_scalar::NAN),
+                      (::std::$f_scalar::NAN, ::std::$f_scalar::NAN),
+                      (1.0, 0.5),
+                      (::std::$f_scalar::MAX, -::std::$f_scalar::MAX),
+                      (::std::$f_scalar::INFINITY, ::std::$f_scalar::INFINITY),
+                      (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NEG_INFINITY),
+                      (::std::$f_scalar::NEG_INFINITY, 5.0),
+                      (5.0, ::std::$f_scalar::INFINITY),
+                      (::std::$f_scalar::NAN, ::std::$f_scalar::INFINITY),
+                      (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NAN),
+                      (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY),
+                     ];
+                for &(low_scalar, high_scalar) in v.iter() {
+                    for lane in 0..<$ty>::lanes() {
+                        let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar);
+                        let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar);
+                        assert!(catch_unwind(|| range(low, high)).is_err());
+                        assert!(catch_unwind(|| Uniform::new(low, high)).is_err());
+                        assert!(catch_unwind(|| Uniform::new_inclusive(low, high)).is_err());
+                        assert!(catch_unwind(|| range(low, low)).is_err());
+                        assert!(catch_unwind(|| Uniform::new(low, low)).is_err());
+                    }
+                }
+            }}
+        }
+
+        t!(f32, f32);
+        t!(f64, f64);
+        #[cfg(feature="simd_support")] t!(f32x2, f32);
+        #[cfg(feature="simd_support")] t!(f32x4, f32);
+        #[cfg(feature="simd_support")] t!(f32x8, f32);
+        #[cfg(feature="simd_support")] t!(f32x16, f32);
+        #[cfg(feature="simd_support")] t!(f64x2, f64);
+        #[cfg(feature="simd_support")] t!(f64x4, f64);
+        #[cfg(feature="simd_support")] t!(f64x8, f64);
+    }
+
     #[test]
     #[cfg(feature = "std")]
     fn test_durations() {
@@ -889,7 +1057,7 @@ mod tests {
         assert_eq!(r.inner.low, 2);
         assert_eq!(r.inner.range, 5);
         let r = Uniform::from(2.0f64..7.0);
-        assert_eq!(r.inner.offset, -3.0);
+        assert_eq!(r.inner.low, 2.0);
         assert_eq!(r.inner.scale, 5.0);
     }
 }
diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs
index f5fef26eb9a..81b7dfcd3f8 100644
--- a/src/distributions/utils.rs
+++ b/src/distributions/utils.rs
@@ -131,34 +131,159 @@ macro_rules! simd_float_from_int {
 /// implement it as a trait so we can write the same code for `f32` and `f64`.
 /// Only the comparison functions we need are implemented.
 pub trait CompareAll {
+    type Mask;
     fn all_lt(self, other: Self) -> bool;
     fn all_le(self, other: Self) -> bool;
+    fn all_finite(self) -> bool;
+    fn finite_mask(self) -> Self::Mask;
+    fn gt_mask(self, other: Self) -> Self::Mask;
+    fn ge_mask(self, other: Self) -> Self::Mask;
+
+    // Decrease all lanes where the mask is `true` to the next lower value
+    // representable by the floating-point type.
+    fn decrease_masked(self, mask: Self::Mask) -> Self;
+}
+
+// Implement functions available in std builds but missing from core primitives
+#[cfg(not(std))]
+pub(crate) trait Float : Sized {
+    type Bits;
+
+    fn is_nan(self) -> bool;
+    fn is_infinite(self) -> bool;
+    fn is_finite(self) -> bool;
+    fn to_bits(self) -> Self::Bits;
+    fn from_bits(v: Self::Bits) -> Self;
+}
+
+// Implement functions on f32/f64 to give them APIs similar to SIMD types
+pub(crate) trait FloatAsSIMD : Sized {
+    #[inline(always)]
+    fn lanes() -> usize { 1 }
+    #[inline(always)]
+    fn splat(scalar: Self) -> Self { scalar }
+    #[inline(always)]
+    fn extract(self, index: usize) -> Self { assert_eq!(index, 0); self }
+    #[inline(always)]
+    fn replace(self, index: usize, new_value: Self) -> Self { assert_eq!(index, 0); new_value }
 }
 
-impl CompareAll for f32 {
-    fn all_lt(self, other: Self) -> bool { self < other }
-    fn all_le(self, other: Self) -> bool { self <= other }
+pub(crate) trait BoolAsSIMD : Sized {
+    fn any(self) -> bool;
+    fn all(self) -> bool;
+    fn none(self) -> bool;
 }
 
-impl CompareAll for f64 {
-    fn all_lt(self, other: Self) -> bool { self < other }
-    fn all_le(self, other: Self) -> bool { self <= other }
+impl BoolAsSIMD for bool {
+    #[inline(always)]
+    fn any(self) -> bool { self }
+    #[inline(always)]
+    fn all(self) -> bool { self }
+    #[inline(always)]
+    fn none(self) -> bool { !self }
+}
+
+macro_rules! scalar_float_impl {
+    ($ty:ident, $uty:ident) => {
+        #[cfg(not(std))]
+        impl Float for $ty {
+            type Bits = $uty;
+
+            #[inline]
+            fn is_nan(self) -> bool {
+                self != self
+            }
+
+            #[inline]
+            fn is_infinite(self) -> bool {
+                self == ::core::$ty::INFINITY || self == ::core::$ty::NEG_INFINITY
+            }
+
+            #[inline]
+            fn is_finite(self) -> bool {
+                !(self.is_nan() || self.is_infinite())
+            }
+
+            #[inline]
+            fn to_bits(self) -> Self::Bits {
+                unsafe { ::core::mem::transmute(self) }
+            }
+
+            #[inline]
+            fn from_bits(v: Self::Bits) -> Self {
+                // It turns out the safety issues with sNaN were overblown! Hooray!
+                unsafe { ::core::mem::transmute(v) }
+            }
+        }
+
+        impl CompareAll for $ty {
+            type Mask = bool;
+            #[inline(always)]
+            fn all_lt(self, other: Self) -> bool { self < other }
+            #[inline(always)]
+            fn all_le(self, other: Self) -> bool { self <= other }
+            #[inline(always)]
+            fn all_finite(self) -> bool { self.is_finite() }
+            #[inline(always)]
+            fn finite_mask(self) -> Self::Mask { self.is_finite() }
+            #[inline(always)]
+            fn gt_mask(self, other: Self) -> Self::Mask { self > other }
+            #[inline(always)]
+            fn ge_mask(self, other: Self) -> Self::Mask { self >= other }
+            #[inline(always)]
+            fn decrease_masked(self, mask: Self::Mask) -> Self {
+                <$ty>::from_bits(self.to_bits() - mask as $uty)
+            }
+        }
+
+        impl FloatAsSIMD for $ty {}
+    }
 }
 
+scalar_float_impl!(f32, u32);
+scalar_float_impl!(f64, u64);
+
+
 #[cfg(feature="simd_support")]
-macro_rules! simd_less_then {
-    ($ty:ident) => {
+macro_rules! simd_impl {
+    ($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => {
         impl CompareAll for $ty {
+            type Mask = $mty;
+            #[inline(always)]
             fn all_lt(self, other: Self) -> bool { self.lt(other).all() }
+            #[inline(always)]
             fn all_le(self, other: Self) -> bool { self.le(other).all() }
+            #[inline(always)]
+            fn all_finite(self) -> bool { self.finite_mask().all() }
+            #[inline(always)]
+            fn finite_mask(self) -> Self::Mask {
+                // This can possibly be done faster by checking bit patterns
+                let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY);
+                let pos_inf = $ty::splat(::core::$f_scalar::INFINITY);
+                self.gt(neg_inf) & self.lt(pos_inf)
+            }
+            #[inline(always)]
+            fn gt_mask(self, other: Self) -> Self::Mask { self.gt(other) }
+            #[inline(always)]
+            fn ge_mask(self, other: Self) -> Self::Mask { self.ge(other) }
+            #[inline(always)]
+            fn decrease_masked(self, mask: Self::Mask) -> Self {
+                // Casting a mask into ints will produce all bits set for
+                // true, and 0 for false. Adding that to the binary
+                // representation of a float means subtracting one from
+                // the binary representation, resulting in the next lower
+                // value representable by $ty. This works even when the
+                // current value is infinity.
+                <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask))
+            }
         }
     }
 }
 
-#[cfg(feature="simd_support")] simd_less_then! { f32x2 }
-#[cfg(feature="simd_support")] simd_less_then! { f32x4 }
-#[cfg(feature="simd_support")] simd_less_then! { f32x8 }
-#[cfg(feature="simd_support")] simd_less_then! { f32x16 }
-#[cfg(feature="simd_support")] simd_less_then! { f64x2 }
-#[cfg(feature="simd_support")] simd_less_then! { f64x4 }
-#[cfg(feature="simd_support")] simd_less_then! { f64x8 }
+#[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 }
+#[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 }
+#[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 }
+#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m1x16, u32x16 }
+#[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 }
+#[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 }
+#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m1x8, u64x8 }
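
The core idea of the patch is the scale-correction loop: shrink `scale` by one ULP at a time until even the largest value the generator can produce maps strictly below `high`. The following is a minimal scalar sketch of that idea for `f64` only (hypothetical standalone code, not taken from the patch; `uniform_float_parts` is an illustrative name, and `max_rand` here is the largest `f64` below 1.0, matching the 52-bit draw used above):

    /// Sketch: derive (low, scale) such that `value0_1 * scale + low < high`
    /// holds for every possible draw `value0_1` in [0, 1). Assumes finite bounds.
    fn uniform_float_parts(low: f64, high: f64) -> (f64, f64) {
        assert!(low < high && low.is_finite() && high.is_finite());
        // Largest draw the [0, 1) generator can produce: 1.0 - 2^-52.
        let max_rand = 1.0 - f64::EPSILON;
        let mut scale = high - low;
        // If rounding pushes the largest sample to `high` (or past it),
        // step `scale` down to the next representable float and re-check.
        while scale * max_rand + low >= high {
            scale = f64::from_bits(scale.to_bits() - 1);
        }
        (low, scale)
    }

    fn main() {
        let (low, scale) = uniform_float_parts(0.0, 100.0);
        // A draw is then mapped exactly as in `sample`: multiply, then add.
        let value0_1 = 0.999_999; // stand-in for a random draw in [0, 1)
        let x = value0_1 * scale + low;
        assert!(low <= x && x < 100.0);
        println!("scale = {}, sample = {}", scale, x);
    }

The SIMD versions in the diff do the same thing per lane, using `ge_mask`/`gt_mask` to find offending lanes and `decrease_masked` to subtract one from the bit pattern of only those lanes.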