From 788b1708cb568baac6fc1193218575017dbd39e1 Mon Sep 17 00:00:00 2001 From: Joel Jakobsson Date: Tue, 16 Apr 2024 22:19:26 +0200 Subject: [PATCH 1/4] Add two new multiply benchmarks testing large factors of unequal sizes test multiply_0 ... bench: 42 ns/iter (+/- 0) test multiply_1 ... bench: 4,271 ns/iter (+/- 68) test multiply_2 ... bench: 289,339 ns/iter (+/- 10,532) test multiply_3 ... bench: 663,387 ns/iter (+/- 17,381) test multiply_4 ... bench: 7,474 ns/iter (+/- 384) test multiply_5 ... bench: 16,376 ns/iter (+/- 254) --- benches/bigint.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/benches/bigint.rs b/benches/bigint.rs index 80ec191c..22795274 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -87,6 +87,16 @@ fn multiply_3(b: &mut Bencher) { multiply_bench(b, 1 << 16, 1 << 17); } +#[bench] +fn multiply_4(b: &mut Bencher) { + multiply_bench(b, 1 << 12, 1 << 13); +} + +#[bench] +fn multiply_5(b: &mut Bencher) { + multiply_bench(b, 1 << 12, 1 << 14); +} + #[bench] fn divide_0(b: &mut Bencher) { divide_bench(b, 1 << 8, 1 << 6); From bdfe16590ec230caf058c1113fb04a20e14f21ea Mon Sep 17 00:00:00 2001 From: Joel Jakobsson Date: Tue, 16 Apr 2024 22:22:06 +0200 Subject: [PATCH 2/4] Implement Half-Karatsuba for unequal factor sizes Introduces Half-Karatsuba in multiplication module for cases where there is a significant size disparity between factors. Optimizes performance by evenly splitting the larger factor for more balanced multiplication calculations. -test multiply_0 ... bench: 42 ns/iter (+/- 0) +test multiply_0 ... bench: 42 ns/iter (+/- 0) -test multiply_1 ... bench: 4,271 ns/iter (+/- 68) +test multiply_1 ... bench: 4,242 ns/iter (+/- 164) -test multiply_2 ... bench: 289,339 ns/iter (+/- 10,532) +test multiply_2 ... bench: 288,857 ns/iter (+/- 8,715) -test multiply_3 ... bench: 663,387 ns/iter (+/- 17,381) +test multiply_3 ... bench: 583,635 ns/iter (+/- 14,997) -test multiply_4 ... bench: 7,474 ns/iter (+/- 384) +test multiply_4 ... bench: 6,649 ns/iter (+/- 164) -test multiply_5 ... bench: 16,376 ns/iter (+/- 254) +test multiply_5 ... bench: 13,922 ns/iter (+/- 323) --- src/biguint/multiplication.rs | 83 ++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs index 4d7f1f21..37c24a42 100644 --- a/src/biguint/multiplication.rs +++ b/src/biguint/multiplication.rs @@ -88,9 +88,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) { let acc = acc; let (x, y) = if b.len() < c.len() { (b, c) } else { (c, b) }; - // We use three algorithms for different input sizes. + // We use four algorithms for different input sizes. // // - For small inputs, long multiplication is fastest. + // - If y is at least least twice as long as x, split using Half-Karatsuba. // - Next we use Karatsuba multiplication (Toom-2), which we have optimized // to avoid unnecessary allocations for intermediate values. // - For the largest inputs we use Toom-3, which better optimizes the @@ -104,6 +105,86 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) { for (i, xi) in x.iter().enumerate() { mac_digit(&mut acc[i..], y, *xi); } + } else if x.len() * 2 <= y.len() { + // Karatsuba Multiplication for factors with significant length disparity. + // + // The Half-Karatsuba Multiplication Algorithm is a specialized case of + // the normal Karatsuba multiplication algorithm, designed for the scenario + // where y has at least twice as many base digits as x. + // + // In this case y (the longer input) is split into high2 and low2, + // at m2 (half the length of y) and x (the shorter input), + // is used directly without splitting. + // + // The algorithm then proceeds as follows: + // + // 1. Compute the product z0 = x * low2. + // 2. Compute the product temp = x * high2. + // 3. Adjust the weight of temp by adding m2 (* NBASE ^ m2) + // 4. Add temp and z0 to obtain the final result. + // + // Proof: + // + // The algorithm can be derived from the original Karatsuba algorithm by + // simplifying the formula when the shorter factor x is not split into + // high and low parts, as shown below. + // + // Original Karatsuba formula: + // + // result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0 + // + // Substitutions: + // + // low1 = x + // high1 = 0 + // + // Applying substitutions: + // + // z0 = (low1 * low2) + // = (x * low2) + // + // z1 = ((low1 + high1) * (low2 + high2)) + // = ((x + 0) * (low2 + high2)) + // = (x * low2) + (x * high2) + // + // z2 = (high1 * high2) + // = (0 * high2) + // = 0 + // + // Simplified using the above substitutions: + // + // result = (z2 * NBASE ^ (m2 × 2)) + ((z1 - z2 - z0) * NBASE ^ m2) + z0 + // = (0 * NBASE ^ (m2 × 2)) + ((z1 - 0 - z0) * NBASE ^ m2) + z0 + // = ((z1 - z0) * NBASE ^ m2) + z0 + // = ((z1 - z0) * NBASE ^ m2) + z0 + // = (x * high2) * NBASE ^ m2 + z0 + let m2 = y.len() / 2; + let (low2, high2) = y.split_at(m2); + + // We reuse the same BigUint for all the intermediate multiplies and have to size p + // appropriately here: x.len() and high2.len() could be different: + let len = x.len() + high2.len() + 1; + let mut p = BigUint { data: vec![0; len] }; + + // z0 = x * low2 + mac3(&mut p.data, x, low2); + p.normalize(); + + // Add z0 directly to the accumulator + add2(acc, &p.data); + + // Zero out p before the next multiply: + p.data.truncate(0); + p.data.resize(len, 0); + + // temp = x * high2 + mac3(&mut p.data, x, high2); + p.normalize(); + + // Add temp shifted by m2 to the accumulator + // This simulates the effect of multiplying temp by b^m2. + // Add directly starting at index m2 in the accumulator. + add2(&mut acc[m2..], &p.data); } else if x.len() <= 256 { // Karatsuba multiplication: // From 5ff5164d3cb834e03b925847c3dfc5bbefd38a87 Mon Sep 17 00:00:00 2001 From: Joel Jakobsson Date: Wed, 17 Apr 2024 04:40:16 +0200 Subject: [PATCH 3/4] Simplify further by eliminating p to avoid allocation Per suggestion from cuviper --- src/biguint/multiplication.rs | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs index 37c24a42..a4e85ec7 100644 --- a/src/biguint/multiplication.rs +++ b/src/biguint/multiplication.rs @@ -161,30 +161,10 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) { let m2 = y.len() / 2; let (low2, high2) = y.split_at(m2); - // We reuse the same BigUint for all the intermediate multiplies and have to size p - // appropriately here: x.len() and high2.len() could be different: - let len = x.len() + high2.len() + 1; - let mut p = BigUint { data: vec![0; len] }; - - // z0 = x * low2 - mac3(&mut p.data, x, low2); - p.normalize(); - - // Add z0 directly to the accumulator - add2(acc, &p.data); - - // Zero out p before the next multiply: - p.data.truncate(0); - p.data.resize(len, 0); - - // temp = x * high2 - mac3(&mut p.data, x, high2); - p.normalize(); + // (x * high2) * NBASE ^ m2 + z0 + mac3(acc, x, low2); + mac3(&mut acc[m2..], x, high2); - // Add temp shifted by m2 to the accumulator - // This simulates the effect of multiplying temp by b^m2. - // Add directly starting at index m2 in the accumulator. - add2(&mut acc[m2..], &p.data); } else if x.len() <= 256 { // Karatsuba multiplication: // From aea928412ecbaa82d4b8e51fc595c8a96fb05566 Mon Sep 17 00:00:00 2001 From: Joel Jakobsson Date: Wed, 17 Apr 2024 07:34:35 +0200 Subject: [PATCH 4/4] Fix format complaint; remove trailing newline --- src/biguint/multiplication.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs index a4e85ec7..fe56ac36 100644 --- a/src/biguint/multiplication.rs +++ b/src/biguint/multiplication.rs @@ -164,7 +164,6 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) { // (x * high2) * NBASE ^ m2 + z0 mac3(acc, x, low2); mac3(&mut acc[m2..], x, high2); - } else if x.len() <= 256 { // Karatsuba multiplication: //