Skip to content

Commit

Permalink
Merge #216
Browse files Browse the repository at this point in the history
216: Reduce the computational complexity of formatting r=cuviper a=cuviper

This only matters for very large values, more than about 1000 digits, but it does make a big difference for those.

For the included benchmarks, I got these base-10 results before:

```text
test to_str_radix_10      ... bench:       2,095 ns/iter (+/- 20)
test to_str_radix_10_2    ... bench:     150,702 ns/iter (+/- 1,231)
```

The `_2` variant is over 10,000 bits. With the new code, I get these results:

```text
test to_str_radix_10      ... bench:       2,065 ns/iter (+/- 22)
test to_str_radix_10_2    ... bench:      44,130 ns/iter (+/- 231)
```

Co-authored-by: Josh Stone <cuviper@gmail.com>
  • Loading branch information
bors[bot] and cuviper committed Aug 28, 2021
2 parents 3753602 + 3af4287 commit 9ea4757
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 34 deletions.
19 changes: 12 additions & 7 deletions benches/bigint.rs
Expand Up @@ -174,35 +174,40 @@ fn fib_to_string(b: &mut Bencher) {
b.iter(|| fib.to_string());
}

fn to_str_radix_bench(b: &mut Bencher, radix: u32) {
fn to_str_radix_bench(b: &mut Bencher, radix: u32, bits: u64) {
let mut rng = get_rng();
let x = rng.gen_bigint(1009);
let x = rng.gen_bigint(bits);
b.iter(|| x.to_str_radix(radix));
}

#[bench]
fn to_str_radix_02(b: &mut Bencher) {
to_str_radix_bench(b, 2);
to_str_radix_bench(b, 2, 1009);
}

#[bench]
fn to_str_radix_08(b: &mut Bencher) {
to_str_radix_bench(b, 8);
to_str_radix_bench(b, 8, 1009);
}

#[bench]
fn to_str_radix_10(b: &mut Bencher) {
to_str_radix_bench(b, 10);
to_str_radix_bench(b, 10, 1009);
}

#[bench]
fn to_str_radix_10_2(b: &mut Bencher) {
to_str_radix_bench(b, 10, 10009);
}

#[bench]
fn to_str_radix_16(b: &mut Bencher) {
to_str_radix_bench(b, 16);
to_str_radix_bench(b, 16, 1009);
}

#[bench]
fn to_str_radix_36(b: &mut Bencher) {
to_str_radix_bench(b, 36);
to_str_radix_bench(b, 36, 1009);
}

fn from_str_radix_bench(b: &mut Bencher, radix: u32) {
Expand Down
35 changes: 34 additions & 1 deletion src/biguint/convert.rs
Expand Up @@ -15,7 +15,7 @@ use core::cmp::Ordering::{Equal, Greater, Less};
use core::convert::TryFrom;
use core::mem;
use core::str::FromStr;
use num_integer::Integer;
use num_integer::{Integer, Roots};
use num_traits::float::FloatCore;
use num_traits::{FromPrimitive, Num, PrimInt, ToPrimitive, Zero};

Expand Down Expand Up @@ -665,6 +665,39 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
let (base, power) = get_radix_base(radix, big_digit::HALF_BITS);
let radix = radix as BigDigit;

// For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
// performance. We can mitigate this by dividing into chunks of a larger base first.
// The threshold for this was chosen by anecdotal performance measurements to
// approximate where this starts to make a noticeable difference.
if digits.data.len() >= 64 {
let mut big_base = BigUint::from(base * base);
let mut big_power = 2usize;

// Choose a target base length near √n.
let target_len = digits.data.len().sqrt();
while big_base.data.len() < target_len {
big_base = &big_base * &big_base;
big_power *= 2;
}

// This outer loop will run approximately √n times.
while digits > big_base {
// This is still the dominating factor, with n digits divided by √n digits.
let (q, mut big_r) = digits.div_rem(&big_base);
digits = q;

// This inner loop now has O(√n²)=O(n) behavior altogether.
for _ in 0..big_power {
let (q, mut r) = div_rem_digit(big_r, base);
big_r = q;
for _ in 0..power {
res.push((r % radix) as u8);
r /= radix;
}
}
}
}

while digits.data.len() > 1 {
let (q, mut r) = div_rem_digit(digits, base);
for _ in 0..power {
Expand Down
49 changes: 23 additions & 26 deletions src/biguint/division.rs
@@ -1,7 +1,7 @@
use super::addition::__add2;
#[cfg(not(u64_digit))]
use super::u32_to_u128;
use super::BigUint;
use super::{cmp_slice, BigUint};

use crate::big_digit::{self, BigDigit, DoubleBigDigit};
use crate::UsizePromotion;
Expand Down Expand Up @@ -153,14 +153,14 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
//
let shift = d.data.last().unwrap().leading_zeros() as usize;

let (q, r) = if shift == 0 {
if shift == 0 {
// no need to clone d
div_rem_core(u, &d)
div_rem_core(u, &d.data)
} else {
div_rem_core(u << shift, &(d << shift))
};
// renormalize the remainder
(q, r >> shift)
let (q, r) = div_rem_core(u << shift, &(d << shift).data);
// renormalize the remainder
(q, r >> shift)
}
}

pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
Expand Down Expand Up @@ -195,24 +195,21 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
//
let shift = d.data.last().unwrap().leading_zeros() as usize;

let (q, r) = if shift == 0 {
if shift == 0 {
// no need to clone d
div_rem_core(u.clone(), d)
div_rem_core(u.clone(), &d.data)
} else {
div_rem_core(u << shift, &(d << shift))
};
// renormalize the remainder
(q, r >> shift)
let (q, r) = div_rem_core(u << shift, &(d << shift).data);
// renormalize the remainder
(q, r >> shift)
}
}

/// An implementation of the base division algorithm.
/// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21.
fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
debug_assert!(
a.data.len() >= b.data.len()
&& b.data.len() > 1
&& b.data.last().unwrap().leading_zeros() == 0
);
fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
debug_assert!(a.data.len() >= b.len() && b.len() > 1);
debug_assert!(b.last().unwrap().leading_zeros() == 0);

// The algorithm works by incrementally calculating "guesses", q0, for the next digit of the
// quotient. Once we have any number q0 such that (q0 << j) * b <= a, we can set
Expand All @@ -235,16 +232,16 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
let mut a0 = 0;

// [b1, b0] are the two most significant digits of the divisor. They never change.
let b0 = *b.data.last().unwrap();
let b1 = b.data[b.data.len() - 2];
let b0 = *b.last().unwrap();
let b1 = b[b.len() - 2];

let q_len = a.data.len() - b.data.len() + 1;
let q_len = a.data.len() - b.len() + 1;
let mut q = BigUint {
data: vec![0; q_len],
};

for j in (0..q_len).rev() {
debug_assert!(a.data.len() == b.data.len() + j);
debug_assert!(a.data.len() == b.len() + j);

let a1 = *a.data.last().unwrap();
let a2 = a.data[a.data.len() - 2];
Expand Down Expand Up @@ -280,11 +277,11 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
// q0 is now either the correct quotient digit, or in rare cases 1 too large.
// Subtract (q0 << j) from a. This may overflow, in which case we will have to correct.

let mut borrow = sub_mul_digit_same_len(&mut a.data[j..], &b.data, q0);
let mut borrow = sub_mul_digit_same_len(&mut a.data[j..], b, q0);
if borrow > a0 {
// q0 is too large. We need to add back one multiple of b.
q0 -= 1;
borrow -= __add2(&mut a.data[j..], &b.data);
borrow -= __add2(&mut a.data[j..], b);
}
// The top digit of a, stored in a0, has now been zeroed.
debug_assert!(borrow == a0);
Expand All @@ -298,7 +295,7 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
a.data.push(a0);
a.normalize();

debug_assert!(a < *b);
debug_assert_eq!(cmp_slice(&a.data, b), Less);

(q.normalized(), a)
}
Expand Down
10 changes: 10 additions & 0 deletions tests/biguint.rs
Expand Up @@ -1600,6 +1600,16 @@ fn test_all_str_radix() {
}
}

#[test]
fn test_big_str() {
for n in 2..=20_u32 {
let x: BigUint = BigUint::from(n).pow(10_000_u32);
let s = x.to_string();
let y: BigUint = s.parse().unwrap();
assert_eq!(x, y);
}
}

#[test]
fn test_lower_hex() {
let a = BigUint::parse_bytes(b"A", 16).unwrap();
Expand Down

0 comments on commit 9ea4757

Please sign in to comment.