From 788b1708cb568baac6fc1193218575017dbd39e1 Mon Sep 17 00:00:00 2001
From: Joel Jakobsson <joel@compiler.org>
Date: Tue, 16 Apr 2024 22:19:26 +0200
Subject: [PATCH 1/4] Add two new multiply benchmarks testing large factors of
 unequal sizes

test multiply_0           ... bench:          42 ns/iter (+/- 0)
test multiply_1           ... bench:       4,271 ns/iter (+/- 68)
test multiply_2           ... bench:     289,339 ns/iter (+/- 10,532)
test multiply_3           ... bench:     663,387 ns/iter (+/- 17,381)
test multiply_4           ... bench:       7,474 ns/iter (+/- 384)
test multiply_5           ... bench:      16,376 ns/iter (+/- 254)
---
 benches/bigint.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benches/bigint.rs b/benches/bigint.rs
index 80ec191c..22795274 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -87,6 +87,16 @@ fn multiply_3(b: &mut Bencher) {
     multiply_bench(b, 1 << 16, 1 << 17);
 }
 
+#[bench]
+fn multiply_4(b: &mut Bencher) {
+    multiply_bench(b, 1 << 12, 1 << 13);
+}
+
+#[bench]
+fn multiply_5(b: &mut Bencher) {
+    multiply_bench(b, 1 << 12, 1 << 14);
+}
+
 #[bench]
 fn divide_0(b: &mut Bencher) {
     divide_bench(b, 1 << 8, 1 << 6);

From bdfe16590ec230caf058c1113fb04a20e14f21ea Mon Sep 17 00:00:00 2001
From: Joel Jakobsson <joel@compiler.org>
Date: Tue, 16 Apr 2024 22:22:06 +0200
Subject: [PATCH 2/4] Implement Half-Karatsuba for unequal factor sizes

Introduces Half-Karatsuba in multiplication module for cases where there is a
significant size disparity between factors. Optimizes performance by evenly
splitting the larger factor for more balanced multiplication calculations.

-test multiply_0           ... bench:          42 ns/iter (+/- 0)
+test multiply_0           ... bench:          42 ns/iter (+/- 0)

-test multiply_1           ... bench:       4,271 ns/iter (+/- 68)
+test multiply_1           ... bench:       4,242 ns/iter (+/- 164)

-test multiply_2           ... bench:     289,339 ns/iter (+/- 10,532)
+test multiply_2           ... bench:     288,857 ns/iter (+/- 8,715)

-test multiply_3           ... bench:     663,387 ns/iter (+/- 17,381)
+test multiply_3           ... bench:     583,635 ns/iter (+/- 14,997)

-test multiply_4           ... bench:       7,474 ns/iter (+/- 384)
+test multiply_4           ... bench:       6,649 ns/iter (+/- 164)

-test multiply_5           ... bench:      16,376 ns/iter (+/- 254)
+test multiply_5           ... bench:      13,922 ns/iter (+/- 323)
---
 src/biguint/multiplication.rs | 83 ++++++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index 4d7f1f21..37c24a42 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -88,9 +88,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
     let acc = acc;
     let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) };
 
-    // We use three algorithms for different input sizes.
+    // We use four algorithms for different input sizes.
     //
     // - For small inputs, long multiplication is fastest.
+    // - If y is at least least twice as long as x, split using Half-Karatsuba.
     // - Next we use Karatsuba multiplication (Toom-2), which we have optimized
     //   to avoid unnecessary allocations for intermediate values.
     // - For the largest inputs we use Toom-3, which better optimizes the
@@ -104,6 +105,86 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
+    } else if x.len() * 2 <= y.len() {
+        // Karatsuba Multiplication for factors with significant length disparity.
+        //
+        // The Half-Karatsuba Multiplication Algorithm is a specialized case of
+        // the normal Karatsuba multiplication algorithm, designed for the scenario
+        // where y has at least twice as many base digits as x.
+        //
+        // In this case y (the longer input) is split into high2 and low2,
+        // at m2 (half the length of y) and x (the shorter input),
+        // is used directly without splitting.
+        //
+        // The algorithm then proceeds as follows:
+        //
+        // 1. Compute the product z0 = x * low2.
+        // 2. Compute the product temp = x * high2.
+        // 3. Adjust the weight of temp by adding m2 (* NBASE ^ m2)
+        // 4. Add temp and z0 to obtain the final result.
+        //
+        // Proof:
+        //
+        // The algorithm can be derived from the original Karatsuba algorithm by
+        // simplifying the formula when the shorter factor x is not split into
+        // high and low parts, as shown below.
+        //
+        // Original Karatsuba formula:
+        //
+        //     result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0
+        //
+        // Substitutions:
+        //
+        //     low1 = x
+        //     high1 = 0
+        //
+        // Applying substitutions:
+        //
+        //     z0 = (low1 * low2)
+        //        = (x * low2)
+        //
+        //     z1 = ((low1 + high1) * (low2 + high2))
+        //        = ((x + 0) * (low2 + high2))
+        //        = (x * low2) + (x * high2)
+        //
+        //     z2 = (high1 * high2)
+        //        = (0 * high2)
+        //        = 0
+        //
+        // Simplified using the above substitutions:
+        //
+        //     result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0
+        //            = (0 * NBASE ^ (m2 × 2)) + ((z1 - 0 - z0) * NBASE ^ m2) + z0
+        //            = ((z1 - z0) * NBASE ^ m2) + z0
+        //            = ((z1 - z0) * NBASE ^ m2) + z0
+        //            = (x * high2) * NBASE ^ m2 + z0
+        let m2 = y.len() / 2;
+        let (low2, high2) = y.split_at(m2);
+
+        // We reuse the same BigUint for all the intermediate multiplies and have to size p
+        // appropriately here: x.len() and high2.len() could be different:
+        let len = x.len() + high2.len() + 1;
+        let mut p = BigUint { data: vec![0; len] };
+
+        // z0 = x * low2
+        mac3(&mut p.data, x, low2);
+        p.normalize();
+
+        // Add z0 directly to the accumulator
+        add2(acc, &p.data);
+
+        // Zero out p before the next multiply:
+        p.data.truncate(0);
+        p.data.resize(len, 0);
+
+        // temp = x * high2
+        mac3(&mut p.data, x, high2);
+        p.normalize();
+
+        // Add temp shifted by m2 to the accumulator
+        // This simulates the effect of multiplying temp by b^m2.
+        // Add directly starting at index m2 in the accumulator.
+        add2(&mut acc[m2..], &p.data);
     } else if x.len() <= 256 {
         // Karatsuba multiplication:
         //

From 5ff5164d3cb834e03b925847c3dfc5bbefd38a87 Mon Sep 17 00:00:00 2001
From: Joel Jakobsson <joel@compiler.org>
Date: Wed, 17 Apr 2024 04:40:16 +0200
Subject: [PATCH 3/4] Simplify further by eliminating p to avoid allocation

Per suggestion from cuviper
---
 src/biguint/multiplication.rs | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index 37c24a42..a4e85ec7 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -161,30 +161,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         let m2 = y.len() / 2;
         let (low2, high2) = y.split_at(m2);
 
-        // We reuse the same BigUint for all the intermediate multiplies and have to size p
-        // appropriately here: x.len() and high2.len() could be different:
-        let len = x.len() + high2.len() + 1;
-        let mut p = BigUint { data: vec![0; len] };
-
-        // z0 = x * low2
-        mac3(&mut p.data, x, low2);
-        p.normalize();
-
-        // Add z0 directly to the accumulator
-        add2(acc, &p.data);
-
-        // Zero out p before the next multiply:
-        p.data.truncate(0);
-        p.data.resize(len, 0);
-
-        // temp = x * high2
-        mac3(&mut p.data, x, high2);
-        p.normalize();
+        // (x * high2) * NBASE ^ m2 + z0
+        mac3(acc, x, low2);
+        mac3(&mut acc[m2..], x, high2);
 
-        // Add temp shifted by m2 to the accumulator
-        // This simulates the effect of multiplying temp by b^m2.
-        // Add directly starting at index m2 in the accumulator.
-        add2(&mut acc[m2..], &p.data);
     } else if x.len() <= 256 {
         // Karatsuba multiplication:
         //

From aea928412ecbaa82d4b8e51fc595c8a96fb05566 Mon Sep 17 00:00:00 2001
From: Joel Jakobsson <joel@compiler.org>
Date: Wed, 17 Apr 2024 07:34:35 +0200
Subject: [PATCH 4/4] Fix format complaint; remove trailing newline

---
 src/biguint/multiplication.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index a4e85ec7..fe56ac36 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -164,7 +164,6 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         // (x * high2) * NBASE ^ m2 + z0
         mac3(acc, x, low2);
         mac3(&mut acc[m2..], x, high2);
-
     } else if x.len() <= 256 {
         // Karatsuba multiplication:
         //