From 0a44f513f5de7b5617fae4376391e8ce53a9de6c Mon Sep 17 00:00:00 2001
From: Vinzent Steinberg <Vinzent.Steinberg@gmail.com>
Date: Tue, 19 Feb 2019 17:03:25 +0100
Subject: [PATCH 1/3] Binomial: Faster sampling for n * p < 10

Fixes #734.
---
 src/distributions/binomial.rs | 115 +++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 50 deletions(-)
diff --git a/src/distributions/binomial.rs b/src/distributions/binomial.rs
index 2df393e53ab..907155b24ed 100644
--- a/src/distributions/binomial.rs
+++ b/src/distributions/binomial.rs
@@ -10,7 +10,7 @@
 //! The binomial distribution.
 
 use Rng;
-use distributions::{Distribution, Bernoulli, Cauchy};
+use distributions::{Distribution, Cauchy};
 use distributions::utils::log_gamma;
 
 /// The binomial distribution `Binomial(n, p)`.
@@ -55,19 +55,7 @@ impl Distribution<u64> for Binomial {
         } else if self.p == 1.0 {
             return self.n;
         }
-        
-        // For low n, it is faster to sample directly. For both methods,
-        // performance is independent of p. On Intel Haswell CPU this method
-        // appears to be faster for approx n < 300.
-        if self.n < 300 {
-            let mut result = 0;
-            let d = Bernoulli::new(self.p);
-            for _ in 0 .. self.n {
-                result += rng.sample(d) as u32;
-            }
-            return result as u64;
-        }
-        
+
         // binomial distribution is symmetrical with respect to p -> 1-p, k -> n-k
         // switch p so that it is less than 0.5 - this allows for lower expected values
         // we will just invert the result at the end
@@ -77,53 +65,80 @@ impl Distribution<u64> for Binomial {
             1.0 - self.p
         };
 
-        // prepare some cached values
-        let float_n = self.n as f64;
-        let ln_fact_n = log_gamma(float_n + 1.0);
-        let pc = 1.0 - p;
-        let log_p = p.ln();
-        let log_pc = pc.ln();
-        let expected = self.n as f64 * p;
-        let sq = (expected * (2.0 * pc)).sqrt();
-
-        let mut lresult;
-
-        // we use the Cauchy distribution as the comparison distribution
-        // f(x) ~ 1/(1+x^2)
-        let cauchy = Cauchy::new(0.0, 1.0);
-        loop {
-            let mut comp_dev: f64;
+        let result;
+
+        // For small n * min(p, 1 - p), the BINV algorithm based on the inverse
+        // transformation of the binomial distribution is more efficient:
+        //
+        // Voratas Kachitvichyanukul and Bruce W. Schmeiser. 1988. Binomial
+        // random variate generation. Commun. ACM 31, 2 (February 1988),
+        // 216-222. http://dx.doi.org/10.1145/42372.42381
+        if (self.n as f64) * p < 10. {
+            let q = 1. - p;
+            let s = p / q;
+            let a = ((self.n + 1) as f64) * s;
+            let mut r = q.powf(self.n as f64);
+            // FIXME: Using integer exponentiation should be faster, but it is not available for
+            // `u64` exponents, neither in `std` or `num`.
+            let mut u: f64 = rng.gen();
+            let mut x = 0;
+            while u > r as f64 {
+                u -= r;
+                x += 1;
+                r *= a / (x as f64) - s;
+            }
+            result = x;
+        } else {
+            // FIXME: Using the BTPE algorithm is probably faster.
+
+            // prepare some cached values
+            let float_n = self.n as f64;
+            let ln_fact_n = log_gamma(float_n + 1.0);
+            let pc = 1.0 - p;
+            let log_p = p.ln();
+            let log_pc = pc.ln();
+            let expected = self.n as f64 * p;
+            let sq = (expected * (2.0 * pc)).sqrt();
+            let mut lresult;
+
+            // we use the Cauchy distribution as the comparison distribution
+            // f(x) ~ 1/(1+x^2)
+            let cauchy = Cauchy::new(0.0, 1.0);
             loop {
-                // draw from the Cauchy distribution
-                comp_dev = rng.sample(cauchy);
-                // shift the peak of the comparison ditribution
-                lresult = expected + sq * comp_dev;
-                // repeat the drawing until we are in the range of possible values
-                if lresult >= 0.0 && lresult < float_n + 1.0 {
-                    break;
+                let mut comp_dev: f64;
+                loop {
+                    // draw from the Cauchy distribution
+                    comp_dev = rng.sample(cauchy);
+                    // shift the peak of the comparison ditribution
+                    lresult = expected + sq * comp_dev;
+                    // repeat the drawing until we are in the range of possible values
+                    if lresult >= 0.0 && lresult < float_n + 1.0 {
+                        break;
+                    }
                 }
-            }
 
-            // the result should be discrete
-            lresult = lresult.floor();
+                // the result should be discrete
+                lresult = lresult.floor();
 
-            let log_binomial_dist = ln_fact_n - log_gamma(lresult+1.0) -
-                log_gamma(float_n - lresult + 1.0) + lresult*log_p + (float_n - lresult)*log_pc;
-            // this is the binomial probability divided by the comparison probability
-            // we will generate a uniform random value and if it is larger than this,
-            // we interpret it as a value falling out of the distribution and repeat
-            let comparison_coeff = (log_binomial_dist.exp() * sq) * (1.2 * (1.0 + comp_dev*comp_dev));
+                let log_binomial_dist = ln_fact_n - log_gamma(lresult+1.0) -
+                    log_gamma(float_n - lresult + 1.0) + lresult*log_p + (float_n - lresult)*log_pc;
+                // this is the binomial probability divided by the comparison probability
+                // we will generate a uniform random value and if it is larger than this,
+                // we interpret it as a value falling out of the distribution and repeat
+                let comparison_coeff = (log_binomial_dist.exp() * sq) * (1.2 * (1.0 + comp_dev*comp_dev));
 
-            if comparison_coeff >= rng.gen() {
-                break;
+                if comparison_coeff >= rng.gen() {
+                    break;
+                }
             }
+            result = lresult as u64;
         }
 
         // invert the result for p < 0.5
         if p != self.p {
-            self.n - lresult as u64
+            self.n - result
         } else {
-            lresult as u64
+            result
         }
     }
 }

From 6323d2951ec2a827b0113e2d81c038e846613b8d Mon Sep 17 00:00:00 2001
From: Vinzent Steinberg <Vinzent.Steinberg@gmail.com>
Date: Tue, 19 Feb 2019 17:14:09 +0100
Subject: [PATCH 2/3] Binomial: Add benchmark for small n * p

Our previous implementation was very slow (#734). The benchmarks makes
sure we won't have such regressions in the future.
---
 benches/distributions.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benches/distributions.rs b/benches/distributions.rs
index 56a8d43b791..5ff91f56e2b 100644
--- a/benches/distributions.rs
+++ b/benches/distributions.rs
@@ -203,6 +203,7 @@ distr_float!(distr_gamma_large_shape, f64, Gamma::new(10., 1.0));
 distr_float!(distr_gamma_small_shape, f64, Gamma::new(0.1, 1.0));
 distr_float!(distr_cauchy, f64, Cauchy::new(4.2, 6.9));
 distr_int!(distr_binomial, u64, Binomial::new(20, 0.7));
+distr_int!(distr_binomial_small, u64, Binomial::new(1000000, 1e-30));
 distr_int!(distr_poisson, u64, Poisson::new(4.0));
 distr!(distr_bernoulli, bool, Bernoulli::new(0.18));
 distr_arr!(distr_circle, [f64; 2], UnitCircle::new());

From 5f6f89e5c7903da9fd6984dfc95300ccd0af630b Mon Sep 17 00:00:00 2001
From: Vinzent Steinberg <Vinzent.Steinberg@gmail.com>
Date: Mon, 25 Feb 2019 14:05:00 +0100
Subject: [PATCH 3/3] Binomial: Use integer exponentation

This increase performance significantly, see below. However, it requires
shipping our own integer exponentiation implementation, because `u64`
exponents are not supported by `std` or `num`.

Before:
```
test distr_binomial       ... bench:  88,365 ns/iter (+/- 2,215) =90 MB/s
test distr_binomial_small ... bench:  33,775 ns/iter (+/- 5,494) =236 MB/s
test misc_binomial_1      ... bench:      13 ns/iter (+/- 2)
test misc_binomial_10     ... bench:      72 ns/iter (+/- 1)
test misc_binomial_100    ... bench:      71 ns/iter (+/- 11)
test misc_binomial_1000   ... bench:     624 ns/iter (+/- 19)
test misc_binomial_1e12   ... bench:     681 ns/iter (+/- 18)
```

After:
```
test distr_binomial       ... bench:  44,354 ns/iter (+/- 3,518) = 180 MB/s
test distr_binomial_small ... bench:  22,736 ns/iter (+/- 514) = 351 MB/s
test misc_binomial_1      ... bench:      10 ns/iter (+/- 0)
test misc_binomial_10     ... bench:      23 ns/iter (+/- 0)
test misc_binomial_100    ... bench:      27 ns/iter (+/- 4)
test misc_binomial_1000   ... bench:     621 ns/iter (+/- 15)
test misc_binomial_1e12   ... bench:     686 ns/iter (+/- 20)
```
---
 src/distributions/binomial.rs | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/distributions/binomial.rs b/src/distributions/binomial.rs
index 907155b24ed..2bad03b78bd 100644
--- a/src/distributions/binomial.rs
+++ b/src/distributions/binomial.rs
@@ -47,6 +47,31 @@ impl Binomial {
     }
 }
 
+/// Raise a `base` to the power of `exp`, using exponentiation by squaring.
+///
+/// This implementation is based on the one in the `num_traits` crate. It is
+/// slightly modified to accept `u64` exponents.
+fn pow(mut base: f64, mut exp: u64) -> f64 {
+    if exp == 0 {
+        return 1.;
+    }
+
+    while exp & 1 == 0 {
+        base *= base;
+        exp >>= 1;
+    }
+
+    let mut acc = base;
+    while exp > 1 {
+        exp >>= 1;
+        base *= base;
+        if exp & 1 == 1 {
+            acc *= base;
+        }
+    }
+    acc
+}
+
 impl Distribution<u64> for Binomial {
     fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u64 {
         // Handle these values directly.
@@ -77,9 +102,7 @@ impl Distribution<u64> for Binomial {
             let q = 1. - p;
             let s = p / q;
             let a = ((self.n + 1) as f64) * s;
-            let mut r = q.powf(self.n as f64);
-            // FIXME: Using integer exponentiation should be faster, but it is not available for
-            // `u64` exponents, neither in `std` or `num`.
+            let mut r = pow(q, self.n);
             let mut u: f64 = rng.gen();
             let mut x = 0;
             while u > r as f64 {