Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use inline asm! for x86 DIV #236

Merged
merged 2 commits into from May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 10 additions & 4 deletions src/biguint/convert.rs
Expand Up @@ -4,7 +4,7 @@
use super::{biguint_from_vec, BigUint, ToBigUint};

use super::addition::add2;
use super::division::div_rem_digit;
use super::division::{div_rem_digit, FAST_DIV_WIDE};
use super::multiplication::mac_with_carry;

use crate::big_digit::{self, BigDigit};
Expand Down Expand Up @@ -688,16 +688,22 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {

let mut digits = u.clone();

let (base, power) = get_half_radix_base(radix);
// X86 DIV can quickly divide by a full digit, otherwise we choose a divisor
// that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
let (base, power) = if FAST_DIV_WIDE {
get_radix_base(radix)
} else {
get_half_radix_base(radix)
};
let radix = radix as BigDigit;

// For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
// performance. We can mitigate this by dividing into chunks of a larger base first.
// The threshold for this was chosen by anecdotal performance measurements to
// approximate where this starts to make a noticeable difference.
if digits.data.len() >= 64 {
let mut big_base = BigUint::from(base * base);
let mut big_power = 2usize;
let mut big_base = BigUint::from(base);
let mut big_power = 1usize;

// Choose a target base length near √n.
let target_len = digits.data.len().sqrt();
Expand Down
37 changes: 34 additions & 3 deletions src/biguint/division.rs
Expand Up @@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign};
use num_integer::Integer;
use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};

/// `true` when compiling for x86 or x86_64, where `div_wide` can be backed by the hardware
/// `div` instruction instead of `DoubleBigDigit` arithmetic.
///
/// Code in this module and in `convert` branches on this flag: when it is set, the
/// `div_half` path for small divisors is skipped and radix conversion picks a full-digit base.
///
/// NOTE(review): the asm-based `div_wide` is additionally gated on `not(miri)`, while this flag
/// is not, so under Miri on x86 the flag is still `true` but the portable `div_wide` is used.
/// That looks like it only affects speed, not results -- confirm this is intended.
pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));

/// Divide a two digit numerator by a one digit divisor, returns quotient and remainder:
///
/// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
/// This is _not_ true for an arbitrary numerator/denominator.
///
/// (This function also matches what the x86 divide instruction does).
#[cfg(any(miri, not(any(target_arch = "x86", target_arch = "x86_64"))))]
#[inline]
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
debug_assert!(hi < divisor);
Expand All @@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
}

/// Divide the two-digit value `hi:lo` by `divisor` with the native x86 / x86_64 `div`
/// instruction, returning `(quotient, remainder)`.
///
/// The caller must guarantee `hi < divisor`; that implies a non-zero divisor and a quotient
/// that fits into a single digit.
#[cfg(all(not(miri), any(target_arch = "x86", target_arch = "x86_64")))]
#[inline]
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
    // `div` raises #DE when the divisor is zero or when the quotient does not fit in one
    // register. Both cases are ruled out by `hi < divisor`. Only debug builds check this; in
    // release builds a violation surfaces as a target-specific fault such as SIGFPE. None of
    // the `div_wide` callers are expected to break this precondition.
    debug_assert!(hi < divisor);

    let quotient: BigDigit;
    let remainder: BigDigit;

    // NOTE(review): `{}` is emitted at the default width for the `reg` class on the target;
    // this presumably relies on `BigDigit` having that same width -- TODO confirm for
    // configurations such as the x86_64 x32 ABI.
    //
    // SAFETY: `div` works purely on registers. It takes the divisor from its explicit operand
    // and the dividend implicitly from RDX:RAX (or EDX:EAX), then stores the quotient in
    // RAX/EAX and the remainder in RDX/EDX. It does not touch memory or the stack, and the
    // flags are left declared as clobbered (no `preserves_flags`).
    unsafe {
        core::arch::asm!(
            "div {}",
            in(reg) divisor,
            inout("ax") lo => quotient,
            inout("dx") hi => remainder,
            options(pure, nomem, nostack),
        );
    }

    (quotient, remainder)
}

/// For small divisors, we can divide without promoting to `DoubleBigDigit` by
/// using half-size pieces of digit, like long-division.
#[inline]
Expand All @@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)

let mut rem = 0;

if b <= big_digit::HALF {
if !FAST_DIV_WIDE && b <= big_digit::HALF {
for d in a.data.iter_mut().rev() {
let (q, r) = div_half(rem, *d, b);
*d = q;
Expand All @@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {

let mut rem = 0;

if b <= big_digit::HALF {
if !FAST_DIV_WIDE && b <= big_digit::HALF {
for &digit in a.data.iter().rev() {
let (_, r) = div_half(rem, digit, b);
rem = r;
Expand Down Expand Up @@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
let mut a0 = 0;

// [b1, b0] are the two most significant digits of the divisor. They never change.
let b0 = *b.last().unwrap();
let b0 = b[b.len() - 1];
let b1 = b[b.len() - 2];

let q_len = a.data.len() - b.len() + 1;
Expand Down