
Commit 47faf1d

Auto merge of #42819 - scottmcm:swap-nonoverlapping, r=sfackler
Reuse the mem::swap optimizations to speed up slice::rotate

This is most helpful for compound types, where LLVM didn't vectorize the loop. Highlight: the slice::rotate_medium_by727_strings benchmark gets 38% faster.

Exposes the swapping logic from PR #40454 as `pub unsafe fn ptr::swap_nonoverlapping`, under the library feature `swap_nonoverlapping` (tracking issue #42818).

(The new method seemed plausible, and was the simplest way to share the logic. I'm not attached to it, though, so let me know if a different way would be better.)
2 parents c169307 + 47fa016 commit 47faf1d
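
The operation the highlighted benchmark exercises is a slice rotation over owned `String`s, the compound-type case the message describes. Below is a minimal sketch of that operation, not part of the commit; it assumes the unstable `slice_rotate` feature of this era and its `rotate(mid)` method (later split into `rotate_left`/`rotate_right`), so the exact API name is an assumption.

```rust
#![feature(slice_rotate)] // unstable slice rotation API of this era (assumption)

fn main() {
    // Rotating owned Strings is the compound-type path the commit message
    // highlights: each element is several machine words, so the old
    // element-wise swap loop was not vectorized by LLVM.
    let mut v: Vec<String> = (0..6).map(|i| i.to_string()).collect();

    // Rotate so the first 2 elements move to the end of the slice.
    v.rotate(2);

    assert_eq!(v, ["2", "3", "4", "5", "0", "1"]);
}
```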


3 files changed (+86, -61 lines)


src/libcore/mem.rs (+1, -53)
@@ -506,59 +506,7 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
-        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
-        // Haswell E processors. LLVM is more able to optimize if we give a struct a
-        // #[repr(simd)], even if we don't actually use this struct directly.
-        //
-        // FIXME repr(simd) broken on emscripten and redox
-        #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
-        struct Block(u64, u64, u64, u64);
-        struct UnalignedBlock(u64, u64, u64, u64);
-
-        let block_size = size_of::<Block>();
-
-        // Get raw pointers to the bytes of x & y for easier manipulation
-        let x = x as *mut T as *mut u8;
-        let y = y as *mut T as *mut u8;
-
-        // Loop through x & y, copying them `Block` at a time
-        // The optimizer should unroll the loop fully for most types
-        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>();
-        let mut i = 0;
-        while i + block_size <= len {
-            // Create some uninitialized memory as scratch space
-            // Declaring `t` here avoids aligning the stack when this loop is unused
-            let mut t: Block = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            // Swap a block of bytes of x & y, using t as a temporary buffer
-            // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x, t, block_size);
-            ptr::copy_nonoverlapping(y, x, block_size);
-            ptr::copy_nonoverlapping(t, y, block_size);
-            i += block_size;
-        }
-
-
-        if i < len {
-            // Swap any remaining bytes, using aligned types to copy
-            // where appropriate (this information is lost by conversion
-            // to *mut u8, so restore it manually here)
-            let mut t: UnalignedBlock = uninitialized();
-            let rem = len - i;
-
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            ptr::copy_nonoverlapping(x, t, rem);
-            ptr::copy_nonoverlapping(y, x, rem);
-            ptr::copy_nonoverlapping(t, y, rem);
-        }
+        ptr::swap_nonoverlapping(x, y, 1);
     }
 }
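
The observable behaviour of `mem::swap` is unchanged by this hunk; only the implementation moves into `ptr::swap_nonoverlapping` with a count of 1. A minimal sketch (not part of the diff) of that unchanged contract, runnable on stable Rust:

```rust
use std::mem;

fn main() {
    // `mem::swap` still exchanges the two values wholesale; its body now
    // simply delegates to `ptr::swap_nonoverlapping(x, y, 1)`.
    let mut a = [0u64; 8];
    let mut b = [1u64; 8];

    mem::swap(&mut a, &mut b);

    assert_eq!(a, [1u64; 8]);
    assert_eq!(b, [0u64; 8]);
}
```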

src/libcore/ptr.rs (+84)
@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
     mem::forget(tmp);
 }
 
+/// Swaps a sequence of values at two mutable locations of the same type.
+///
+/// # Safety
+///
+/// The two arguments must each point to the beginning of `count` locations
+/// of valid memory, and the two memory ranges must not overlap.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(swap_nonoverlapping)]
+///
+/// use std::ptr;
+///
+/// let mut x = [1, 2, 3, 4];
+/// let mut y = [7, 8, 9];
+///
+/// unsafe {
+///     ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
+/// }
+///
+/// assert_eq!(x, [7, 8, 3, 4]);
+/// assert_eq!(y, [1, 2, 9]);
+/// ```
+#[inline]
+#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
+pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x as *mut u8;
+    let y = y as *mut u8;
+    let len = mem::size_of::<T>() * count;
+    swap_nonoverlapping_bytes(x, y, len)
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
+    // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+    // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+    // Haswell E processors. LLVM is more able to optimize if we give a struct a
+    // #[repr(simd)], even if we don't actually use this struct directly.
+    //
+    // FIXME repr(simd) broken on emscripten and redox
+    #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
+    struct Block(u64, u64, u64, u64);
+    struct UnalignedBlock(u64, u64, u64, u64);
+
+    let block_size = mem::size_of::<Block>();
+
+    // Loop through x & y, copying them `Block` at a time
+    // The optimizer should unroll the loop fully for most types
+    // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+    let mut i = 0;
+    while i + block_size <= len {
+        // Create some uninitialized memory as scratch space
+        // Declaring `t` here avoids aligning the stack when this loop is unused
+        let mut t: Block = mem::uninitialized();
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        // Swap a block of bytes of x & y, using t as a temporary buffer
+        // This should be optimized into efficient SIMD operations where available
+        copy_nonoverlapping(x, t, block_size);
+        copy_nonoverlapping(y, x, block_size);
+        copy_nonoverlapping(t, y, block_size);
+        i += block_size;
+    }
+
+    if i < len {
+        // Swap any remaining bytes
+        let mut t: UnalignedBlock = mem::uninitialized();
+        let rem = len - i;
+
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        copy_nonoverlapping(x, t, rem);
+        copy_nonoverlapping(y, x, rem);
+        copy_nonoverlapping(t, y, rem);
+    }
+}
+
 /// Replaces the value at `dest` with `src`, returning the old
 /// value, without dropping either.
 ///
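
The doc-comment example above uses integers; the commit message's benchmark win comes from compound types. A minimal sketch (not part of the diff) of the same call on owned `String`s, using only the API this hunk adds; it requires a nightly toolchain with the `swap_nonoverlapping` feature:

```rust
#![feature(swap_nonoverlapping)] // library feature added in this commit

use std::ptr;

fn main() {
    // Two separate vectors, so the element ranges cannot overlap.
    let mut a: Vec<String> = vec!["lorem".into(), "ipsum".into(), "dolor".into()];
    let mut b: Vec<String> = vec!["foo".into(), "bar".into(), "baz".into()];

    // Swap the first two Strings of each vector in a single call; the String
    // headers are exchanged byte-for-byte, so ownership moves with them and
    // nothing is dropped twice.
    unsafe {
        ptr::swap_nonoverlapping(a.as_mut_ptr(), b.as_mut_ptr(), 2);
    }

    assert_eq!(a, ["foo", "bar", "dolor"]);
    assert_eq!(b, ["lorem", "ipsum", "baz"]);
}
```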

src/libcore/slice/rotate.rs (+1, -8)
@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
             break;
         }
 
-        ptr_swap_n(
+        ptr::swap_nonoverlapping(
             mid.offset(-(left as isize)),
             mid.offset((right-delta) as isize),
             delta);
@@ -103,10 +103,3 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
         ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
     }
 }
-
-unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {
-    for i in 0..n {
-        // These are nonoverlapping, so use mem::swap instead of ptr::swap
-        mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));
-    }
-}
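
This hunk relies on the two calls being equivalent: the removed `ptr_swap_n` swapped `n` elements one at a time with `mem::swap`, and a single `ptr::swap_nonoverlapping` call must produce the same result whenever the two ranges do not overlap (as the rotation's two halves do not). A minimal sketch of that equivalence, not from the commit, runnable on a nightly with the new feature:

```rust
#![feature(swap_nonoverlapping)]

use std::{mem, ptr};

fn main() {
    let mut a = [10, 20, 30, 40];
    let mut b = [50, 60, 70, 80];
    let mut c = a;
    let mut d = b;

    // Old approach, as the removed `ptr_swap_n` did: one `mem::swap` per element.
    for i in 0..3 {
        mem::swap(&mut a[i], &mut b[i]);
    }

    // New approach: one call covering the whole run of elements.
    unsafe {
        ptr::swap_nonoverlapping(c.as_mut_ptr(), d.as_mut_ptr(), 3);
    }

    assert_eq!(a, c);
    assert_eq!(b, d);
}
```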
