diff --git a/rand_hc/src/hc128.rs b/rand_hc/src/hc128.rs
index 8bc85f917fe..4f192afcb77 100644
--- a/rand_hc/src/hc128.rs
+++ b/rand_hc/src/hc128.rs
@@ -128,6 +128,12 @@ impl BlockRngCore for Hc128Core {
         let cc = self.counter1024 % 512;
         let dd = (cc + 16) % 512;
         let ee = cc.wrapping_sub(16) % 512;
+        // These asserts let the compiler optimize out the bounds checks.
+        // Some of them may be superfluous, and that's fine:
+        // they'll be optimized out if that's the case.
+        assert!(ee + 15 < 512);
+        assert!(cc + 15 < 512);
+        assert!(dd < 512);
 
         if self.counter1024 & 512 == 0 {
             // P block
@@ -175,25 +181,19 @@ impl Hc128Core {
     #[inline(always)]
     fn step_p(&mut self, i: usize, i511: usize, i3: usize, i10: usize, i12: usize) -> u32 {
         let (p, q) = self.t.split_at_mut(512);
-        // FIXME: it would be great if we the bounds checks here could be
-        // optimized out, and we would not need unsafe.
-        // This improves performance by about 7%.
-        unsafe {
-            let temp0 = p.get_unchecked(i511).rotate_right(23);
-            let temp1 = p.get_unchecked(i3).rotate_right(10);
-            let temp2 = p.get_unchecked(i10).rotate_right(8);
-            *p.get_unchecked_mut(i) = p
-                .get_unchecked(i)
-                .wrapping_add(temp2)
-                .wrapping_add(temp0 ^ temp1);
-            let temp3 = {
-                // The h1 function in HC-128
-                let a = *p.get_unchecked(i12) as u8;
-                let c = (p.get_unchecked(i12) >> 16) as u8;
-                q[a as usize].wrapping_add(q[256 + c as usize])
-            };
-            temp3 ^ p.get_unchecked(i)
-        }
+        let temp0 = p[i511].rotate_right(23);
+        let temp1 = p[i3].rotate_right(10);
+        let temp2 = p[i10].rotate_right(8);
+        p[i] = p[i]
+            .wrapping_add(temp2)
+            .wrapping_add(temp0 ^ temp1);
+        let temp3 = {
+            // The h1 function in HC-128
+            let a = p[i12] as u8;
+            let c = (p[i12] >> 16) as u8;
+            q[a as usize].wrapping_add(q[256 + c as usize])
+        };
+        temp3 ^ p[i]
     }
 
     // One step of HC-128, update Q and generate 32 bits keystream
@@ -202,22 +202,20 @@ impl Hc128Core {
     #[inline(always)]
     fn step_q(&mut self, i: usize, i511: usize, i3: usize, i10: usize, i12: usize) -> u32 {
         let (p, q) = self.t.split_at_mut(512);
-        unsafe {
-            let temp0 = q.get_unchecked(i511).rotate_left(23);
-            let temp1 = q.get_unchecked(i3).rotate_left(10);
-            let temp2 = q.get_unchecked(i10).rotate_left(8);
-            *q.get_unchecked_mut(i) = q
-                .get_unchecked(i)
-                .wrapping_add(temp2)
-                .wrapping_add(temp0 ^ temp1);
-            let temp3 = {
-                // The h2 function in HC-128
-                let a = *q.get_unchecked(i12) as u8;
-                let c = (q.get_unchecked(i12) >> 16) as u8;
-                p[a as usize].wrapping_add(p[256 + c as usize])
-            };
-            temp3 ^ q.get_unchecked(i)
-        }
+        let temp0 = q[i511].rotate_left(23);
+        let temp1 = q[i3].rotate_left(10);
+        let temp2 = q[i10].rotate_left(8);
+        q[i] = q
+            [i]
+            .wrapping_add(temp2)
+            .wrapping_add(temp0 ^ temp1);
+        let temp3 = {
+            // The h2 function in HC-128
+            let a = q[i12] as u8;
+            let c = (q[i12] >> 16) as u8;
+            p[a as usize].wrapping_add(p[256 + c as usize])
+        };
+        temp3 ^ q[i]
     }
 
     fn sixteen_steps(&mut self) {
@@ -226,6 +224,12 @@ impl Hc128Core {
         let cc = self.counter1024 % 512;
         let dd = (cc + 16) % 512;
         let ee = cc.wrapping_sub(16) % 512;
+        // These asserts let the compiler optimize out the bounds checks.
+        // Some of them may be superfluous, and that's fine:
+        // they'll be optimized out if that's the case.
+        assert!(ee + 15 < 512);
+        assert!(cc + 15 < 512);
+        assert!(dd < 512);
 
         if self.counter1024 < 512 {
             // P block