Skip to content

Commit c0771f2

Browse files
committed
Auto merge of #44051 - eddyb:apfloat-faster-div, r=nagisa
Speed up APFloat division by using short division for small divisors. Fixes #43828 (hopefully), by not doing long division bit-by-bit for small divisors. When parsing the ~200,000 decimal float literals in the `tuple-stress` benchmark, this change brings roughly a 5x speed increase (from `0.6s` to `0.12s`), and the hottest instructions are native `div`s.
2 parents 2c0558f + b9c69ec commit c0771f2

File tree

1 file changed

+96
-30
lines changed

1 file changed

+96
-30
lines changed

src/librustc_apfloat/ieee.rs

+96-30
Original file line numberDiff line numberDiff line change
@@ -460,18 +460,15 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
460460
// rem <- sig % 10
461461
// sig <- sig / 10
462462
let mut rem = 0;
463-
for limb in sig.iter_mut().rev() {
464-
// We don't have an integer doubly wide than Limb,
465-
// so we have to split the divrem on two halves.
466-
const HALF_BITS: usize = LIMB_BITS / 2;
467-
let mut halves = [*limb & ((1 << HALF_BITS) - 1), *limb >> HALF_BITS];
468-
for half in halves.iter_mut().rev() {
469-
*half |= rem << HALF_BITS;
470-
rem = *half % 10;
471-
*half /= 10;
472-
}
473-
*limb = halves[0] | (halves[1] << HALF_BITS);
474-
}
463+
464+
// Use 64-bit division and remainder, with 32-bit chunks from sig.
465+
sig::each_chunk(&mut sig, 32, |chunk| {
466+
let chunk = chunk as u32;
467+
let combined = ((rem as u64) << 32) | (chunk as u64);
468+
rem = (combined % 10) as u8;
469+
(combined / 10) as u32 as Limb
470+
});
471+
475472
// Reduce the sigificand to avoid wasting time dividing 0's.
476473
while sig.last() == Some(&0) {
477474
sig.pop();
@@ -491,7 +488,7 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
491488
exp += 1;
492489
} else {
493490
in_trail = false;
494-
buffer.push(b'0' + digit as u8);
491+
buffer.push(b'0' + digit);
495492
}
496493
}
497494

@@ -2065,7 +2062,7 @@ impl<S: Semantics> IeeeFloat<S> {
20652062
};
20662063

20672064
// Attempt dec_sig * 10^dec_exp with increasing precision.
2068-
let mut attempt = 1;
2065+
let mut attempt = 0;
20692066
loop {
20702067
let calc_precision = (LIMB_BITS << attempt) - 1;
20712068
attempt += 1;
@@ -2310,6 +2307,17 @@ mod sig {
23102307
limbs.iter().all(|&l| l == 0)
23112308
}
23122309

2310+
/// One, not zero, based LSB. That is, returns 0 for a zeroed significand.
2311+
pub(super) fn olsb(limbs: &[Limb]) -> usize {
2312+
for i in 0..limbs.len() {
2313+
if limbs[i] != 0 {
2314+
return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1;
2315+
}
2316+
}
2317+
2318+
0
2319+
}
2320+
23132321
/// One, not zero, based MSB. That is, returns 0 for a zeroed significand.
23142322
pub(super) fn omsb(limbs: &[Limb]) -> usize {
23152323
for i in (0..limbs.len()).rev() {
@@ -2468,6 +2476,20 @@ mod sig {
24682476
}
24692477
}
24702478

2479+
/// For every consecutive chunk of `bits` bits from `limbs`,
2480+
/// going from most significant to the least significant bits,
2481+
/// call `f` to transform those bits and store the result back.
2482+
pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
2483+
assert_eq!(LIMB_BITS % bits, 0);
2484+
for limb in limbs.iter_mut().rev() {
2485+
let mut r = 0;
2486+
for i in (0..LIMB_BITS / bits).rev() {
2487+
r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
2488+
}
2489+
*limb = r;
2490+
}
2491+
}
2492+
24712493
/// Increment in-place, return the carry flag.
24722494
pub(super) fn increment(dst: &mut [Limb]) -> Limb {
24732495
for x in dst {
@@ -2686,10 +2708,6 @@ mod sig {
26862708
divisor: &mut [Limb],
26872709
precision: usize,
26882710
) -> Loss {
2689-
// Zero the quotient before setting bits in it.
2690-
for x in &mut quotient[..limbs_for_bits(precision)] {
2691-
*x = 0;
2692-
}
26932711

26942712
// Normalize the divisor.
26952713
let bits = precision - omsb(divisor);
@@ -2700,6 +2718,13 @@ mod sig {
27002718
let bits = precision - omsb(dividend);
27012719
shift_left(dividend, exp, bits);
27022720

2721+
// Division by 1.
2722+
let olsb_divisor = olsb(divisor);
2723+
if olsb_divisor == precision {
2724+
quotient.copy_from_slice(dividend);
2725+
return Loss::ExactlyZero;
2726+
}
2727+
27032728
// Ensure the dividend >= divisor initially for the loop below.
27042729
// Incidentally, this means that the division loop below is
27052730
// guaranteed to set the integer bit to one.
@@ -2708,6 +2733,58 @@ mod sig {
27082733
assert_ne!(cmp(dividend, divisor), Ordering::Less)
27092734
}
27102735

2736+
// Helper for figuring out the lost fraction.
2737+
let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| {
2738+
match cmp(dividend, divisor) {
2739+
Ordering::Greater => Loss::MoreThanHalf,
2740+
Ordering::Equal => Loss::ExactlyHalf,
2741+
Ordering::Less => {
2742+
if is_all_zeros(dividend) {
2743+
Loss::ExactlyZero
2744+
} else {
2745+
Loss::LessThanHalf
2746+
}
2747+
}
2748+
}
2749+
};
2750+
2751+
// Try to perform a (much faster) short division for small divisors.
2752+
let divisor_bits = precision - (olsb_divisor - 1);
2753+
macro_rules! try_short_div {
2754+
($W:ty, $H:ty, $half:expr) => {
2755+
if divisor_bits * 2 <= $half {
2756+
// Extract the small divisor.
2757+
let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1);
2758+
let divisor = divisor[0] as $H as $W;
2759+
2760+
// Shift the dividend to produce a quotient with the unit bit set.
2761+
let top_limb = *dividend.last().unwrap();
2762+
let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H;
2763+
shift_left(dividend, &mut 0, divisor_bits - 1);
2764+
2765+
// Apply short division in place on $H (of $half bits) chunks.
2766+
each_chunk(dividend, $half, |chunk| {
2767+
let chunk = chunk as $H;
2768+
let combined = ((rem as $W) << $half) | (chunk as $W);
2769+
rem = (combined % divisor) as $H;
2770+
(combined / divisor) as $H as Limb
2771+
});
2772+
quotient.copy_from_slice(dividend);
2773+
2774+
return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]);
2775+
}
2776+
}
2777+
}
2778+
2779+
try_short_div!(u32, u16, 16);
2780+
try_short_div!(u64, u32, 32);
2781+
try_short_div!(u128, u64, 64);
2782+
2783+
// Zero the quotient before setting bits in it.
2784+
for x in &mut quotient[..limbs_for_bits(precision)] {
2785+
*x = 0;
2786+
}
2787+
27112788
// Long division.
27122789
for bit in (0..precision).rev() {
27132790
if cmp(dividend, divisor) != Ordering::Less {
@@ -2717,17 +2794,6 @@ mod sig {
27172794
shift_left(dividend, &mut 0, 1);
27182795
}
27192796

2720-
// Figure out the lost fraction.
2721-
match cmp(dividend, divisor) {
2722-
Ordering::Greater => Loss::MoreThanHalf,
2723-
Ordering::Equal => Loss::ExactlyHalf,
2724-
Ordering::Less => {
2725-
if is_all_zeros(dividend) {
2726-
Loss::ExactlyZero
2727-
} else {
2728-
Loss::LessThanHalf
2729-
}
2730-
}
2731-
}
2797+
lost_fraction(dividend, divisor)
27322798
}
27332799
}

0 commit comments

Comments
 (0)