@@ -1081,9 +1081,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
1081
1081
// uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
1082
1082
// (usize::max_value() + 1)` instead, because we take the result `mod n` at the end
1083
1083
// anyway.
1084
- inverse = inverse. wrapping_mul ( 2usize . wrapping_sub ( x. wrapping_mul ( inverse) ) )
1085
- & ( going_mod - 1 ) ;
1086
- if going_mod > m {
1084
+ inverse = inverse. wrapping_mul ( 2usize . wrapping_sub ( x. wrapping_mul ( inverse) ) ) ;
1085
+ if going_mod >= m {
1087
1086
return inverse & ( m - 1 ) ;
1088
1087
}
1089
1088
going_mod = going_mod. wrapping_mul ( going_mod) ;
@@ -1115,26 +1114,33 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
1115
1114
let gcdpow = intrinsics:: cttz_nonzero ( stride) . min ( intrinsics:: cttz_nonzero ( a) ) ;
1116
1115
let gcd = 1usize << gcdpow;
1117
1116
1118
- if p as usize & ( gcd - 1 ) == 0 {
1117
+ if p as usize & ( gcd. wrapping_sub ( 1 ) ) == 0 {
1119
1118
// This branch solves for the following linear congruence equation:
1120
1119
//
1121
- // $$ p + so ≡ 0 mod a $$
1120
+ // ` p + so = 0 mod a `
1122
1121
//
1123
- // $p$ here is the pointer value, $s$ – stride of `T`, $o$ offset in `T`s, and $a$ – the
1122
+ // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
1124
1123
// requested alignment.
1125
1124
//
1126
- // g = gcd(a, s)
1127
- // o = (a - (p mod a))/g * ((s/g)⁻¹ mod a)
1125
+ // With ` g = gcd(a, s)`, and the above asserting that `p` is also divisible by `g`, we can
1126
+ // denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
1128
1127
//
1129
- // The first term is “the relative alignment of p to a”, the second term is “how does
1130
- // incrementing p by s bytes change the relative alignment of p”. Division by `g` is
1131
- // necessary to make this equation well formed if $a$ and $s$ are not co-prime.
1128
+ // ` p' + s'o = 0 mod a' `
1129
+ // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
1132
1130
//
1133
- // Furthermore, the result produced by this solution is not “minimal”, so it is necessary
1134
- // to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
1135
- let j = a. wrapping_sub ( pmoda) >> gcdpow;
1136
- let k = smoda >> gcdpow;
1137
- return intrinsics:: unchecked_rem ( j. wrapping_mul ( mod_inv ( k, a) ) , a >> gcdpow) ;
1131
+ // The first term is "the relative alignment of `p` to `a`" (divided by `g`), the second
1132
+ // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`" (again
1133
+ // divided by `g`).
1134
+ // Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
1135
+ // co-prime.
1136
+ //
1137
+ // Furthermore, the result produced by this solution is not "minimal", so it is necessary
1138
+ // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just `a'`.
1139
+ let a2 = a >> gcdpow;
1140
+ let a2minus1 = a2. wrapping_sub ( 1 ) ;
1141
+ let s2 = smoda >> gcdpow;
1142
+ let minusp2 = a2. wrapping_sub ( pmoda >> gcdpow) ;
1143
+ return ( minusp2. wrapping_mul ( mod_inv ( s2, a2) ) ) & a2minus1;
1138
1144
}
1139
1145
1140
1146
// Cannot be aligned at all.
0 commit comments