Merge pull request #295 from joelonsql/half-karatsuba

Optimise multiplication
rust-num · Apr 24, 2024 · 06b61c8 · 06b61c8
2 parents e50a1be + aea9284
commit 06b61c8
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 1 deletion.
diff --git a/benches/bigint.rs b/benches/bigint.rs
@@ -87,6 +87,16 @@ fn multiply_3(b: &mut Bencher) {
     multiply_bench(b, 1 << 16, 1 << 17);
 }
 
+#[bench]
+fn multiply_4(b: &mut Bencher) {
+    multiply_bench(b, 1 << 12, 1 << 13);
+}
+
+#[bench]
+fn multiply_5(b: &mut Bencher) {
+    multiply_bench(b, 1 << 12, 1 << 14);
+}
+
 #[bench]
 fn divide_0(b: &mut Bencher) {
     divide_bench(b, 1 << 8, 1 << 6);

diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
@@ -88,9 +88,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
     let acc = acc;
     let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) };
 
-    // We use three algorithms for different input sizes.
+    // We use four algorithms for different input sizes.
     //
     // - For small inputs, long multiplication is fastest.
+    // - If y is at least least twice as long as x, split using Half-Karatsuba.
     // - Next we use Karatsuba multiplication (Toom-2), which we have optimized
     //   to avoid unnecessary allocations for intermediate values.
     // - For the largest inputs we use Toom-3, which better optimizes the
@@ -104,6 +105,65 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
+    } else if x.len() * 2 <= y.len() {
+        // Karatsuba Multiplication for factors with significant length disparity.
+        //
+        // The Half-Karatsuba Multiplication Algorithm is a specialized case of
+        // the normal Karatsuba multiplication algorithm, designed for the scenario
+        // where y has at least twice as many base digits as x.
+        //
+        // In this case y (the longer input) is split into high2 and low2,
+        // at m2 (half the length of y) and x (the shorter input),
+        // is used directly without splitting.
+        //
+        // The algorithm then proceeds as follows:
+        //
+        // 1. Compute the product z0 = x * low2.
+        // 2. Compute the product temp = x * high2.
+        // 3. Adjust the weight of temp by adding m2 (* NBASE ^ m2)
+        // 4. Add temp and z0 to obtain the final result.
+        //
+        // Proof:
+        //
+        // The algorithm can be derived from the original Karatsuba algorithm by
+        // simplifying the formula when the shorter factor x is not split into
+        // high and low parts, as shown below.
+        //
+        // Original Karatsuba formula:
+        //
+        //     result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0
+        //
+        // Substitutions:
+        //
+        //     low1 = x
+        //     high1 = 0
+        //
+        // Applying substitutions:
+        //
+        //     z0 = (low1 * low2)
+        //        = (x * low2)
+        //
+        //     z1 = ((low1 + high1) * (low2 + high2))
+        //        = ((x + 0) * (low2 + high2))
+        //        = (x * low2) + (x * high2)
+        //
+        //     z2 = (high1 * high2)
+        //        = (0 * high2)
+        //        = 0
+        //
+        // Simplified using the above substitutions:
+        //
+        //     result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0
+        //            = (0 * NBASE ^ (m2 × 2)) + ((z1 - 0 - z0) * NBASE ^ m2) + z0
+        //            = ((z1 - z0) * NBASE ^ m2) + z0
+        //            = ((z1 - z0) * NBASE ^ m2) + z0
+        //            = (x * high2) * NBASE ^ m2 + z0
+        let m2 = y.len() / 2;
+        let (low2, high2) = y.split_at(m2);
+
+        // (x * high2) * NBASE ^ m2 + z0
+        mac3(acc, x, low2);
+        mac3(&mut acc[m2..], x, high2);
     } else if x.len() <= 256 {
         // Karatsuba multiplication:
         //