diff --git a/Cargo.toml b/Cargo.toml
index 0e83f1a6f..67bf41dd1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,11 +32,23 @@ typenum = { version = "1.16", features = ["const-generics"] }
 const-default = { version = "1", optional = true, default-features = false }
 serde = { version = "1.0", optional = true, default-features = false }
 zeroize = { version = "1", optional = true, default-features = false }
+faster-hex = { version = "0.8", optional = true, default-features = false }
 
 [dev-dependencies]
 # this can't yet be made optional, see https://github.com/rust-lang/cargo/issues/1596
 serde_json = "1.0"
 bincode = "1.0"
+criterion = { version = "0.5", features = ["html_reports"] }
+rand = "0.8"
+
+[[bench]]
+name = "hex"
+harness = false
+
+[profile.bench]
+opt-level = 3
+lto = 'fat'
+codegen-units = 1
 
 [package.metadata.docs.rs]
 # all but "internals", don't show those on docs.rs
diff --git a/benches/hex.rs b/benches/hex.rs
new file mode 100644
index 000000000..ff879c9ae
--- /dev/null
+++ b/benches/hex.rs
@@ -0,0 +1,46 @@
+use criterion::{
+    criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion,
+};
+use generic_array::{typenum::*, ArrayLength, GenericArray};
+use rand::RngCore;
+
+use std::{fmt::UpperHex, io::Write};
+
+/// Registers one upper-case hex formatting benchmark per array length in the "hex" group.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hex");
+    let mut rng = rand::thread_rng();
+
+    // Expands to one `bench_hex::<Len>` call per listed typenum length, in the order given.
+    macro_rules! bench_each_len {
+        ($($n:ty,)*) => {
+            $(bench_hex::<$n>(&mut rng, &mut group);)*
+        }
+    }
+
+    bench_each_len!(
+        U1, U2, U4, U8, U12, U15, U16, U32, U64, U100, U128, U160, U255, U256, U500, U512, U900,
+        U1023, U1024, Sum<U1024, U1>, U2048, U4096, Prod<U1000, U5>, U10000,
+    );
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
+
+/// Benchmarks `{:X}` formatting of a random `N`-byte array into a reused, pre-sized buffer.
+fn bench_hex<N: ArrayLength>(mut rng: impl RngCore, g: &mut BenchmarkGroup<'_, WallTime>)
+where
+    GenericArray<u8, N>: UpperHex,
+{
+    let mut fixture = Box::<GenericArray<u8, N>>::default();
+    rng.fill_bytes(fixture.as_mut_slice());
+    g.bench_function(format!("N{:08}", N::USIZE), |b| {
+        let mut out = Vec::with_capacity(N::USIZE * 2);
+        b.iter(|| {
+            _ = write!(out, "{:X}", criterion::black_box(&*fixture));
+            criterion::black_box(&out); // keep the encoded bytes observable to the optimizer
+            out.clear();
+        });
+    });
+}
diff --git a/src/hex.rs b/src/hex.rs
index a132d6646..c86056c72 100644
--- a/src/hex.rs
+++ b/src/hex.rs
@@ -18,12 +18,40 @@ use typenum::*;
 
 use crate::{ArrayLength, GenericArray};
 
-static LOWER_CHARS: [u8; 16] = *b"0123456789abcdef";
-static UPPER_CHARS: [u8; 16] = *b"0123456789ABCDEF";
+/// Hex-encodes `src` into the first `2 * src.len()` bytes of `dst`; panics if `dst` is shorter.
+#[inline(always)]
+fn hex_encode_fallback<const UPPER: bool>(src: &[u8], dst: &mut [u8]) {
+    // Checked rather than hinted: in a safe fn a short `dst` must panic, not be UB.
+    assert!(dst.len() >= src.len() * 2, "hex output buffer too small");
+
+    let alphabet = match UPPER {
+        true => b"0123456789ABCDEF",
+        false => b"0123456789abcdef",
+    };
+
+    dst.chunks_exact_mut(2).zip(src).for_each(|(pair, byte)| {
+        pair[0] = alphabet[(byte >> 4) as usize];
+        pair[1] = alphabet[(byte & 0xF) as usize];
+    });
+}
+
+/// Hex-encodes `src` into `dst`, using `faster-hex` when that feature is on (never under Miri).
+#[inline]
+fn hex_encode<const UPPER: bool>(src: &[u8], dst: &mut [u8]) {
+    // Checked in release builds too: the `unwrap_unchecked` calls below rely on it.
+    assert!(dst.len() >= src.len() * 2, "hex output buffer too small");
+    #[cfg(any(miri, not(feature = "faster-hex")))]
+    hex_encode_fallback::<UPPER>(src, dst);
+
+    // SAFETY: `dst` is large enough (asserted above), so faster-hex's length check cannot fail
+    #[cfg(all(feature = "faster-hex", not(miri)))]
+    match UPPER {
+        true => unsafe { faster_hex::hex_encode_upper(src, dst).unwrap_unchecked() },
+        false => unsafe { faster_hex::hex_encode(src, dst).unwrap_unchecked() },
+    };
+}
+
+fn generic_hex<N: ArrayLength, const UPPER: bool>(
     arr: &GenericArray<u8, N>,
-    alphabet: &[u8; 16], // use fixed-length array to avoid slice index checks
     f: &mut fmt::Formatter<'_>,
 ) -> fmt::Result
 where
@@ -36,32 +64,43 @@ where
         _ => max_digits,
     };
 
-    let max_hex = (max_digits >> 1) + (max_digits & 1);
+    // ceil(max_digits / 2)
+    let max_bytes = (max_digits >> 1) + (max_digits & 1);
+
+    let input = {
+        // LLVM can't seem to prove `max_bytes <= N` on its own, so state it as a hint
+        if max_bytes > N::USIZE {
+            unsafe { core::hint::unreachable_unchecked() };
+        }
+
+        &arr[..max_bytes]
+    };
 
     if N::USIZE <= 1024 {
-        // For small arrays use a stack allocated
-        // buffer of 2x number of bytes
-        let mut res = GenericArray::<u8, Sum<N, N>>::default();
+        // For small arrays use a stack allocated buffer of 2x number of bytes
+        let mut buf = GenericArray::<u8, Sum<N, N>>::default();
 
-        arr.iter().take(max_hex).enumerate().for_each(|(i, c)| {
-            res[i * 2] = alphabet[(c >> 4) as usize];
-            res[i * 2 + 1] = alphabet[(c & 0xF) as usize];
-        });
+        if N::USIZE < 16 {
+            // for the smallest inputs, don't bother limiting to max_bytes,
+            // just process the entire array. When "faster-hex" is enabled,
+            // this avoids its logic that winds up going to the fallback anyway
+            hex_encode_fallback::<UPPER>(arr, &mut buf);
+        } else {
+            hex_encode::<UPPER>(input, &mut buf);
+        }
 
-        f.write_str(unsafe { str::from_utf8_unchecked(&res[..max_digits]) })?;
+        f.write_str(unsafe { str::from_utf8_unchecked(buf.get_unchecked(..max_digits)) })?;
     } else {
         // For large array use chunks of up to 1024 bytes (2048 hex chars)
         let mut buf = [0u8; 2048];
         let mut digits_left = max_digits;
 
-        for chunk in arr[..max_hex].chunks(1024) {
-            chunk.iter().enumerate().for_each(|(i, c)| {
-                buf[i * 2] = alphabet[(c >> 4) as usize];
-                buf[i * 2 + 1] = alphabet[(c & 0xF) as usize];
-            });
+        for chunk in input.chunks(1024) {
+            hex_encode::<UPPER>(chunk, &mut buf);
 
             let n = min(chunk.len() * 2, digits_left);
-            f.write_str(unsafe { str::from_utf8_unchecked(&buf[..n]) })?;
+            // SAFETY: n <= 2 * chunk.len() <= 2048 == buf.len(), and buf is all ASCII
+            f.write_str(unsafe { str::from_utf8_unchecked(buf.get_unchecked(..n)) })?;
             digits_left -= n;
         }
     }
@@ -74,7 +113,7 @@ where
     Sum<N, N>: ArrayLength,
 {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        generic_hex(self, &LOWER_CHARS, f)
+        generic_hex::<_, false>(self, f)
     }
 }
 
@@ -84,6 +123,6 @@ where
     Sum<N, N>: ArrayLength,
 {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        generic_hex(self, &UPPER_CHARS, f)
+        generic_hex::<_, true>(self, f)
     }
 }