From 77f697eaa1cf030ce687d2ebd503b592dfbcb808 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 16:56:03 -0800
Subject: [PATCH 1/6] Add more mul/pow tests

---
 benches/bigint.rs | 17 ++++++++++++++++-
 tests/bigint.rs   |  4 ++++
 tests/biguint.rs  |  4 ++++
 3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/benches/bigint.rs b/benches/bigint.rs
index b7f5fd21..960b7cee 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -39,7 +39,7 @@ fn factorial(n: usize) -> BigUint {
     let mut f: BigUint = One::one();
     for i in 1..=n {
         let bu: BigUint = FromPrimitive::from_usize(i).unwrap();
-        f += bu;
+        f *= bu;
     }
     f
 }
@@ -351,6 +351,21 @@ fn pow_bench_bigexp(b: &mut Bencher) {
     });
 }
 
+#[bench]
+fn pow_bench_1e1000(b: &mut Bencher) {
+    b.iter(|| BigUint::from(10u32).pow(1_000));
+}
+
+#[bench]
+fn pow_bench_1e10000(b: &mut Bencher) {
+    b.iter(|| BigUint::from(10u32).pow(10_000));
+}
+
+#[bench]
+fn pow_bench_1e100000(b: &mut Bencher) {
+    b.iter(|| BigUint::from(10u32).pow(100_000));
+}
+
 /// This modulus is the prime from the 2048-bit MODP DH group:
 /// https://tools.ietf.org/html/rfc3526#section-3
 const RFC3526_2048BIT_MODP_GROUP: &str = "\
diff --git a/tests/bigint.rs b/tests/bigint.rs
index 8eff5ba7..f244bc4b 100644
--- a/tests/bigint.rs
+++ b/tests/bigint.rs
@@ -1306,6 +1306,10 @@ fn test_pow() {
     check!(u32);
     check!(u64);
     check!(usize);
+
+    let pow_1e10000 = BigInt::from(10u32).pow(10_000_u32);
+    let manual_1e10000 = repeat(10u32).take(10_000).product::<BigInt>();
+    assert!(manual_1e10000 == pow_1e10000);
 }
 
 #[test]
diff --git a/tests/biguint.rs b/tests/biguint.rs
index 13b69f21..716bcc81 100644
--- a/tests/biguint.rs
+++ b/tests/biguint.rs
@@ -1776,6 +1776,10 @@ fn test_pow() {
     check!(u64);
     check!(u128);
     check!(usize);
+
+    let pow_1e10000 = BigUint::from(10u32).pow(10_000_u32);
+    let manual_1e10000 = repeat(10u32).take(10_000).product::<BigUint>();
+    assert!(manual_1e10000 == pow_1e10000);
 }
 
 #[test]

From d4015d90f2c7330e03d133c6f6d65c58907a73e8 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 16:59:31 -0800
Subject: [PATCH 2/6] Do normalization truncation all at once

---
 src/biguint.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/biguint.rs b/src/biguint.rs
index 42350714..271a8837 100644
--- a/src/biguint.rs
+++ b/src/biguint.rs
@@ -846,8 +846,9 @@ impl BigUint {
     /// be nonzero.
     #[inline]
     fn normalize(&mut self) {
-        while let Some(&0) = self.data.last() {
-            self.data.pop();
+        if let Some(&0) = self.data.last() {
+            let len = self.data.iter().rposition(|&d| d != 0).map_or(0, |i| i + 1);
+            self.data.truncate(len);
         }
         if self.data.len() < self.data.capacity() / 4 {
             self.data.shrink_to_fit();

From 42c9c9ac7b567d4ebc2b436d5b6b45d4d260b5ed Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 17:05:23 -0800
Subject: [PATCH 3/6] Enable allocation reuse in scalar multiplication

---
 src/bigint/multiplication.rs  |  57 ++++++++----
 src/biguint/multiplication.rs | 168 ++++++++++++++++++++--------------
 src/biguint/power.rs          |   5 +-
 3 files changed, 144 insertions(+), 86 deletions(-)

diff --git a/src/bigint/multiplication.rs b/src/bigint/multiplication.rs
index aaf5b142..a2d97081 100644
--- a/src/bigint/multiplication.rs
+++ b/src/bigint/multiplication.rs
@@ -21,24 +21,49 @@ impl Mul<Sign> for Sign {
     }
 }
 
-forward_all_binop_to_ref_ref!(impl Mul for BigInt, mul);
-
-impl<'a, 'b> Mul<&'b BigInt> for &'a BigInt {
-    type Output = BigInt;
-
-    #[inline]
-    fn mul(self, other: &BigInt) -> BigInt {
-        BigInt::from_biguint(self.sign * other.sign, &self.data * &other.data)
-    }
+macro_rules! impl_mul {
+    ($(impl<$($a:lifetime),*> Mul<$Other:ty> for $Self:ty;)*) => {$(
+        impl<$($a),*> Mul<$Other> for $Self {
+            type Output = BigInt;
+
+            #[inline]
+            fn mul(self, other: $Other) -> BigInt {
+                // automatically match value/ref
+                let BigInt { data: x, .. } = self;
+                let BigInt { data: y, .. } = other;
+                BigInt::from_biguint(self.sign * other.sign, x * y)
+            }
+        }
+    )*}
+}
+impl_mul! {
+    impl<> Mul<BigInt> for BigInt;
+    impl<'b> Mul<&'b BigInt> for BigInt;
+    impl<'a> Mul<BigInt> for &'a BigInt;
+    impl<'a, 'b> Mul<&'b BigInt> for &'a BigInt;
+}
+
+macro_rules! impl_mul_assign {
+    ($(impl<$($a:lifetime),*> MulAssign<$Other:ty> for BigInt;)*) => {$(
+        impl<$($a),*> MulAssign<$Other> for BigInt {
+            #[inline]
+            fn mul_assign(&mut self, other: $Other) {
+                // automatically match value/ref
+                let BigInt { data: y, .. } = other;
+                self.data *= y;
+                if self.data.is_zero() {
+                    self.sign = NoSign;
+                } else {
+                    self.sign = self.sign * other.sign;
+                }
+            }
+        }
+    )*}
 }
-
-impl<'a> MulAssign<&'a BigInt> for BigInt {
-    #[inline]
-    fn mul_assign(&mut self, other: &BigInt) {
-        *self = &*self * other;
-    }
+impl_mul_assign! {
+    impl<> MulAssign<BigInt> for BigInt;
+    impl<'a> MulAssign<&'a BigInt> for BigInt;
 }
-forward_val_assign!(impl MulAssign for BigInt, mul_assign);
 
 promote_all_scalars!(impl Mul for BigInt, mul);
 promote_all_scalars_assign!(impl MulAssign for BigInt, mul_assign);
diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index aaa69344..c07cca17 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -11,7 +11,7 @@ use crate::{BigInt, UsizePromotion};
 use core::cmp::Ordering;
 use core::iter::Product;
 use core::ops::{Mul, MulAssign};
-use num_traits::{CheckedMul, One, Zero};
+use num_traits::{CheckedMul, FromPrimitive, One, Zero};
 
 #[inline]
 pub(super) fn mac_with_carry(
@@ -155,28 +155,28 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
 
         // We reuse the same BigUint for all the intermediate multiplies and have to size p
         // appropriately here: x1.len() >= x0.len and y1.len() >= y0.len():
-        let len = x1.len() + y1.len() + 1;
+        let len = x1.len() + y1.len();
         let mut p = BigUint { data: vec![0; len] };
 
         // p2 = x1 * y1
-        mac3(&mut p.data[..], x1, y1);
+        mac3(&mut p.data, x1, y1);
 
         // Not required, but the adds go faster if we drop any unneeded 0s from the end:
         p.normalize();
 
-        add2(&mut acc[b..], &p.data[..]);
-        add2(&mut acc[b * 2..], &p.data[..]);
+        add2(&mut acc[b..], &p.data);
+        add2(&mut acc[b * 2..], &p.data);
 
         // Zero out p before the next multiply:
         p.data.truncate(0);
         p.data.resize(len, 0);
 
         // p0 = x0 * y0
-        mac3(&mut p.data[..], x0, y0);
+        mac3(&mut p.data, x0, y0);
         p.normalize();
 
-        add2(&mut acc[..], &p.data[..]);
-        add2(&mut acc[b..], &p.data[..]);
+        add2(acc, &p.data);
+        add2(&mut acc[b..], &p.data);
 
         // p1 = (x1 - x0) * (y1 - y0)
         // We do this one last, since it may be negative and acc can't ever be negative:
@@ -188,13 +188,13 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
                 p.data.truncate(0);
                 p.data.resize(len, 0);
 
-                mac3(&mut p.data[..], &j0.data[..], &j1.data[..]);
+                mac3(&mut p.data, &j0.data, &j1.data);
                 p.normalize();
 
-                sub2(&mut acc[b..], &p.data[..]);
+                sub2(&mut acc[b..], &p.data);
             }
             Minus => {
-                mac3(&mut acc[b..], &j0.data[..], &j1.data[..]);
+                mac3(&mut acc[b..], &j0.data, &j1.data);
             }
             NoSign => (),
         }
@@ -321,25 +321,41 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
 }
 
 fn mul3(x: &[BigDigit], y: &[BigDigit]) -> BigUint {
-    let len = x.len() + y.len() + 1;
+    let len = x.len() + y.len();
     let mut prod = BigUint { data: vec![0; len] };
 
-    mac3(&mut prod.data[..], x, y);
+    mac3(&mut prod.data, x, y);
     prod.normalized()
 }
 
-fn scalar_mul(a: &mut [BigDigit], b: BigDigit) -> BigDigit {
-    let mut carry = 0;
-    for a in a.iter_mut() {
-        *a = mul_with_carry(*a, b, &mut carry);
+fn scalar_mul(a: &mut BigUint, b: BigDigit) {
+    match b {
+        0 => a.set_zero(),
+        1 => {}
+        _ => {
+            if b.is_power_of_two() {
+                *a <<= b.trailing_zeros();
+            } else {
+                let mut carry = 0;
+                for a in a.data.iter_mut() {
+                    *a = mul_with_carry(*a, b, &mut carry);
+                }
+                if carry != 0 {
+                    a.data.push(carry as BigDigit);
+                }
+            }
+        }
     }
-    carry as BigDigit
 }
 
 fn sub_sign(mut a: &[BigDigit], mut b: &[BigDigit]) -> (Sign, BigUint) {
     // Normalize:
-    a = &a[..a.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
-    b = &b[..b.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
+    if let Some(&0) = a.last() {
+        a = &a[..a.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
+    }
+    if let Some(&0) = b.last() {
+        b = &b[..b.iter().rposition(|&x| x != 0).map_or(0, |i| i + 1)];
+    }
 
     match cmp_slice(a, b) {
         Ordering::Greater => {
@@ -356,22 +372,55 @@ fn sub_sign(mut a: &[BigDigit], mut b: &[BigDigit]) -> (Sign, BigUint) {
     }
 }
 
-forward_all_binop_to_ref_ref!(impl Mul for BigUint, mul);
-forward_val_assign!(impl MulAssign for BigUint, mul_assign);
-
-impl<'a, 'b> Mul<&'b BigUint> for &'a BigUint {
-    type Output = BigUint;
+macro_rules! impl_mul {
+    ($(impl<$($a:lifetime),*> Mul<$Other:ty> for $Self:ty;)*) => {$(
+        impl<$($a),*> Mul<$Other> for $Self {
+            type Output = BigUint;
+
+            #[inline]
+            fn mul(self, other: $Other) -> BigUint {
+                match (&*self.data, &*other.data) {
+                    // multiply by zero
+                    (&[], _) | (_, &[]) => BigUint::zero(),
+                    // multiply by a scalar
+                    (_, &[digit]) => self * digit,
+                    (&[digit], _) => other * digit,
+                    // full multiplication
+                    (x, y) => mul3(x, y),
+                }
+            }
+        }
+    )*}
+}
+impl_mul! {
+    impl<> Mul<BigUint> for BigUint;
+    impl<'b> Mul<&'b BigUint> for BigUint;
+    impl<'a> Mul<BigUint> for &'a BigUint;
+    impl<'a, 'b> Mul<&'b BigUint> for &'a BigUint;
+}
 
-    #[inline]
-    fn mul(self, other: &BigUint) -> BigUint {
-        mul3(&self.data[..], &other.data[..])
-    }
+macro_rules! impl_mul_assign {
+    ($(impl<$($a:lifetime),*> MulAssign<$Other:ty> for BigUint;)*) => {$(
+        impl<$($a),*> MulAssign<$Other> for BigUint {
+            #[inline]
+            fn mul_assign(&mut self, other: $Other) {
+                match (&*self.data, &*other.data) {
+                    // multiply by zero
+                    (&[], _) => {},
+                    (_, &[]) => self.set_zero(),
+                    // multiply by a scalar
+                    (_, &[digit]) => *self *= digit,
+                    (&[digit], _) => *self = other * digit,
+                    // full multiplication
+                    (x, y) => *self = mul3(x, y),
+                }
+            }
+        }
+    )*}
 }
-impl<'a> MulAssign<&'a BigUint> for BigUint {
-    #[inline]
-    fn mul_assign(&mut self, other: &'a BigUint) {
-        *self = &*self * other
-    }
+impl_mul_assign! {
+    impl<> MulAssign<BigUint> for BigUint;
+    impl<'a> MulAssign<&'a BigUint> for BigUint;
 }
 
 promote_unsigned_scalars!(impl Mul for BigUint, mul);
@@ -392,14 +441,7 @@ impl Mul<u32> for BigUint {
 impl MulAssign<u32> for BigUint {
     #[inline]
     fn mul_assign(&mut self, other: u32) {
-        if other == 0 {
-            self.data.clear();
-        } else {
-            let carry = scalar_mul(&mut self.data[..], other as BigDigit);
-            if carry != 0 {
-                self.data.push(carry);
-            }
-        }
+        scalar_mul(self, other as BigDigit);
     }
 }
 
@@ -416,27 +458,18 @@ impl MulAssign<u64> for BigUint {
     #[cfg(not(u64_digit))]
     #[inline]
     fn mul_assign(&mut self, other: u64) {
-        if other == 0 {
-            self.data.clear();
-        } else if other <= u64::from(BigDigit::max_value()) {
-            *self *= other as BigDigit
+        if let Some(other) = BigDigit::from_u64(other) {
+            scalar_mul(self, other);
         } else {
             let (hi, lo) = big_digit::from_doublebigdigit(other);
-            *self = mul3(&self.data[..], &[lo, hi])
+            *self = mul3(&self.data, &[lo, hi]);
         }
     }
 
     #[cfg(u64_digit)]
     #[inline]
     fn mul_assign(&mut self, other: u64) {
-        if other == 0 {
-            self.data.clear();
-        } else {
-            let carry = scalar_mul(&mut self.data[..], other as BigDigit);
-            if carry != 0 {
-                self.data.push(carry);
-            }
-        }
+        scalar_mul(self, other);
     }
 }
 
@@ -454,26 +487,25 @@ impl MulAssign<u128> for BigUint {
     #[cfg(not(u64_digit))]
     #[inline]
     fn mul_assign(&mut self, other: u128) {
-        if other == 0 {
-            self.data.clear();
-        } else if other <= u128::from(BigDigit::max_value()) {
-            *self *= other as BigDigit
+        if let Some(other) = BigDigit::from_u128(other) {
+            scalar_mul(self, other);
         } else {
-            let (a, b, c, d) = u32_from_u128(other);
-            *self = mul3(&self.data[..], &[d, c, b, a])
+            *self = match u32_from_u128(other) {
+                (0, 0, c, d) => mul3(&self.data, &[d, c]),
+                (0, b, c, d) => mul3(&self.data, &[d, c, b]),
+                (a, b, c, d) => mul3(&self.data, &[d, c, b, a]),
+            };
         }
     }
 
     #[cfg(u64_digit)]
     #[inline]
     fn mul_assign(&mut self, other: u128) {
-        if other == 0 {
-            self.data.clear();
-        } else if other <= BigDigit::max_value() as u128 {
-            *self *= other as BigDigit
+        if let Some(other) = BigDigit::from_u128(other) {
+            scalar_mul(self, other);
         } else {
             let (hi, lo) = big_digit::from_doublebigdigit(other);
-            *self = mul3(&self.data[..], &[lo, hi])
+            *self = mul3(&self.data, &[lo, hi]);
         }
     }
 }
@@ -502,6 +534,6 @@ fn test_sub_sign() {
     let a_i = BigInt::from(a.clone());
     let b_i = BigInt::from(b.clone());
 
-    assert_eq!(sub_sign_i(&a.data[..], &b.data[..]), &a_i - &b_i);
-    assert_eq!(sub_sign_i(&b.data[..], &a.data[..]), &b_i - &a_i);
+    assert_eq!(sub_sign_i(&a.data, &b.data), &a_i - &b_i);
+    assert_eq!(sub_sign_i(&b.data, &a.data), &b_i - &a_i);
 }
diff --git a/src/biguint/power.rs b/src/biguint/power.rs
index 44b38146..d24651bf 100644
--- a/src/biguint/power.rs
+++ b/src/biguint/power.rs
@@ -85,7 +85,7 @@ macro_rules! pow_impl {
                     exp >>= 1;
                     base = &base * &base;
                     if exp & 1 == 1 {
-                        acc = &acc * &base;
+                        acc *= &base;
                     }
                 }
                 acc
@@ -185,7 +185,8 @@ fn plain_modpow(base: &BigUint, exp_data: &[BigDigit], modulus: &BigUint) -> Big
         let mut unit = |exp_is_odd| {
             base = &base * &base % modulus;
             if exp_is_odd {
-                acc = &acc * &base % modulus;
+                acc *= &base;
+                acc %= modulus;
             }
         };
 

From c528be1468570fbab6707e43068e4b382bd8852c Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 17:06:12 -0800
Subject: [PATCH 4/6] Skip trailing zeros in multiplication

---
 src/biguint/multiplication.rs | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index c07cca17..860c6e16 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -66,7 +66,20 @@ fn bigint_from_slice(slice: &[BigDigit]) -> BigInt {
 /// Three argument multiply accumulate:
 /// acc += b * c
 #[allow(clippy::many_single_char_names)]
-fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
+fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
+    // Least-significant zeros have no effect on the output.
+    if let Some(&0) = b.first() {
+        let nz = b.iter().position(|&d| d != 0).unwrap();
+        b = &b[nz..];
+        acc = &mut acc[nz..];
+    }
+    if let Some(&0) = c.first() {
+        let nz = c.iter().position(|&d| d != 0).unwrap();
+        c = &c[nz..];
+        acc = &mut acc[nz..];
+    }
+
+    let acc = acc;
     let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) };
 
     // We use three algorithms for different input sizes.

From bc967f6abea8395a787ed364172d1405952e4135 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 17:06:54 -0800
Subject: [PATCH 5/6] Use shifts instead of mul/div by 2 in Toom-3

---
 src/biguint/multiplication.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index 860c6e16..c7afd3f4 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -312,10 +312,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         //
         // This particular sequence is given by Bodrato and is an interpolation
         // of the above equations.
-        let mut comp3: BigInt = (r3 - &r1) / 3;
-        let mut comp1: BigInt = (r1 - &r2) / 2;
+        let mut comp3: BigInt = (r3 - &r1) / 3u32;
+        let mut comp1: BigInt = (r1 - &r2) >> 1;
         let mut comp2: BigInt = r2 - &r0;
-        comp3 = (&comp2 - comp3) / 2 + &r4 * 2;
+        comp3 = ((&comp2 - comp3) >> 1) + (&r4 << 1);
         comp2 += &comp1 - &r4;
         comp1 -= &comp3;
 

From dce75e10ce76a7209bbae5cbc8351064e10abd4a Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Sat, 13 Mar 2021 17:07:37 -0800
Subject: [PATCH 6/6] Avoid allocation in the Toom-3 finalization

---
 src/biguint/multiplication.rs | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index c7afd3f4..581c9e17 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -2,7 +2,7 @@ use super::addition::{__add2, add2};
 use super::subtraction::sub2;
 #[cfg(not(u64_digit))]
 use super::u32_from_u128;
-use super::{biguint_from_vec, cmp_slice, BigUint};
+use super::{biguint_from_vec, cmp_slice, BigUint, IntDigits};
 
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::Sign::{self, Minus, NoSign, Plus};
@@ -322,14 +322,24 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         // Recomposition. The coefficients of the polynomial are now known.
         //
         // Evaluate at w(t) where t is our given base to get the result.
-        let bits = u64::from(big_digit::BITS) * i as u64;
-        let result = r0
-            + (comp1 << bits)
-            + (comp2 << (2 * bits))
-            + (comp3 << (3 * bits))
-            + (r4 << (4 * bits));
-        let result_pos = result.to_biguint().unwrap();
-        add2(&mut acc[..], &result_pos.data);
+        //
+        //     let bits = u64::from(big_digit::BITS) * i as u64;
+        //     let result = r0
+        //         + (comp1 << bits)
+        //         + (comp2 << (2 * bits))
+        //         + (comp3 << (3 * bits))
+        //         + (r4 << (4 * bits));
+        //     let result_pos = result.to_biguint().unwrap();
+        //     add2(&mut acc[..], &result_pos.data);
+        //
+        // But with less intermediate copying:
+        for (j, result) in [&r0, &comp1, &comp2, &comp3, &r4].iter().enumerate().rev() {
+            match result.sign() {
+                Plus => add2(&mut acc[i * j..], result.digits()),
+                Minus => sub2(&mut acc[i * j..], result.digits()),
+                NoSign => {}
+            }
+        }
     }
 }