Skip to content

Commit

Permalink
Avoid surrogates when generating char using Standard distribution
Browse files Browse the repository at this point in the history
  • Loading branch information
Pazzaz committed Jun 20, 2018
1 parent b7b1176 commit e00ec65
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
11 changes: 11 additions & 0 deletions benches/misc.rs
Expand Up @@ -150,3 +150,14 @@ fn gen_1k_fill(b: &mut Bencher) {
});
b.bytes = 1024;
}

/// Benchmarks producing a batch of random `char`s through `Rng::gen`.
///
/// Each iteration builds a fresh `Vec<char>` and returns it from the closure.
/// The reported throughput is based on the in-memory size of one batch.
#[bench]
fn misc_gen_chars(b: &mut Bencher) {
    use std::mem;

    // Number of `char`s generated per benchmark iteration.
    const BATCH_LEN: usize = 128;

    let mut rng = StdRng::from_rng(&mut thread_rng()).unwrap();
    b.iter(|| (0..BATCH_LEN).map(|_| rng.gen()).collect::<Vec<char>>());
    // A `char` is four bytes wide, so this evaluates to 512.
    b.bytes = (BATCH_LEN * mem::size_of::<char>()) as u64;
}
19 changes: 11 additions & 8 deletions src/distributions/other.rs
Expand Up @@ -44,15 +44,18 @@ pub struct Alphanumeric;
// NOTE(review): this listing appears to be a diff rendered without +/- markers
// (the file header reports 11 additions and 8 deletions). The rejection-sampling
// loop and the gap-skipping code below are therefore presumably the old and new
// bodies of `sample`, not one compilable function — confirm against the
// repository before copying.
impl Distribution<char> for Standard {
/// Returns a random `char`, never a surrogate code point.
#[inline]
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
// Rejection sampling: draw a candidate from 0..0x11_0000 and retry until
// `char::from_u32` accepts it.
let range = Uniform::new(0u32, 0x11_0000);
loop {
match char::from_u32(range.sample(rng)) {
Some(c) => return c,
// About 0.2% of numbers in the range 0..0x110000 are invalid
// codepoints (surrogates).
None => {}
}
// A valid `char` is either in the interval `[0, 0xD800)` or
// `(0xDFFF, 0x11_0000)`. All `char`s must therefore be in
// `[0, 0x11_0000)` but not in the "gap" `[0xD800, 0xDFFF]` which is
// reserved for surrogates. This is the size of that gap.
const GAP_SIZE: u32 = 0xDFFF - 0xD800 + 1;

// Start the range at GAP_SIZE instead of 0 so that it contains exactly as
// many values as there are valid `char`s; no retry loop is needed.
let range = Uniform::new(GAP_SIZE, 0x11_0000);
let mut n = range.sample(rng);
// Samples up to the top of the surrogate gap are shifted down by GAP_SIZE,
// which lands them below 0xD800; larger samples are already past the gap.
if n <= 0xDFFF {
n -= GAP_SIZE;
}
// SAFETY: assuming `Uniform::new` samples the half-open range
// `[GAP_SIZE, 0x11_0000)` (TODO confirm against the `Uniform` docs), the
// adjustment above leaves `n` in `[0, 0xD800)` or `[0xE000, 0x11_0000)`,
// so it is never a surrogate and never exceeds the maximum code point.
unsafe { char::from_u32_unchecked(n) }
}
}

Expand Down

0 comments on commit e00ec65

Please sign in to comment.