
Commit 42de36d

Rollup merge of rust-lang#48639 - varkor:sort_by_key-cached, r=bluss
Add slice::sort_by_cached_key as a memoised sort_by_key

At present, `slice::sort_by_key` calls its key function twice for each comparison that is made. When the key function is expensive (which can often be the case when `sort_by_key` is chosen over `sort_by`), this can lead to very suboptimal behaviour.

To address this, I've introduced a new slice method, `sort_by_cached_key`, which has identical semantic behaviour to `sort_by_key`, except that it guarantees the key function will be called only once per element.

Where there are `n` elements and the key function is `O(m)`:

- `slice::sort_by_cached_key` time complexity is `O(m n + n log n)`, compared to `slice::sort_by_key`'s `O(m n log(m n))`.
- `slice::sort_by_cached_key` space complexity remains at `O(n + m)`. (Technically, it now reserves a slice of size `n`, whereas before it reserved a slice of size `n/2`.)

`slice::sort_unstable_by_key` has not been given an analogue, as it is important that unstable sorts are in-place, which is not a property that is guaranteed here. However, this also means that `slice::sort_unstable_by_key` is likely to be slower than `slice::sort_by_cached_key` when the key function does not have negligible complexity. We might want to explore this trade-off further in the future.

Benchmarks (for a vector of 100 `i32`s):

```
# Lexicographic: `|x| x.to_string()`
test bench_sort_by_key        ... bench: 112,638 ns/iter (+/- 19,563)
test bench_sort_by_cached_key ... bench:  15,038 ns/iter (+/- 4,814)

# Identity: `|x| *x`
test bench_sort_by_key        ... bench:   1,346 ns/iter (+/- 238)
test bench_sort_by_cached_key ... bench:   1,839 ns/iter (+/- 765)

# Power: `|x| x.pow(31)`
test bench_sort_by_key        ... bench:   3,624 ns/iter (+/- 738)
test bench_sort_by_cached_key ... bench:   1,997 ns/iter (+/- 311)

# Abs: `|x| x.abs()`
test bench_sort_by_key        ... bench:   1,546 ns/iter (+/- 174)
test bench_sort_by_cached_key ... bench:   1,668 ns/iter (+/- 790)
```

(So it seems functions that are single operations do perform slightly worse with this method, but for pretty much any more complex key, you're better off with this optimisation.)

I've definitely found myself using expensive keys in the past and wishing this optimisation was made (e.g. for rust-lang#47415). This feels like both desirable and expected behaviour, at the small cost of slightly more allocation and minute degradation in performance for extremely trivial keys.

Resolves rust-lang#34447.
2 parents 31ae4f9 + 9c7b69e commit 42de36d
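
As a point of reference, here is a minimal usage sketch of the new method (not part of the patch; the input values are arbitrary). Both calls sort by the decimal-string key, but `sort_by_cached_key` evaluates the key once per element instead of twice per comparison, which is where the lexicographic benchmark win above comes from:

```rust
#![feature(slice_sort_by_cached_key)] // still feature-gated at the time of this commit

fn main() {
    let mut a = vec![170, 45, 75, -90, -802, 24, 2, 66];
    let mut b = a.clone();

    // `sort_by_key` re-runs the key closure for both sides of every comparison.
    a.sort_by_key(|x| x.to_string());
    // `sort_by_cached_key` runs the key closure exactly once per element.
    b.sort_by_cached_key(|x| x.to_string());

    // Both sorts are stable and use the same key, so the results agree.
    assert_eq!(a, b);
    println!("{:?}", a);
}
```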

5 files changed (+111, -13 lines)


src/liballoc/benches/lib.rs (+1)

@@ -13,6 +13,7 @@
 #![cfg_attr(stage0, feature(i128_type))]
 #![feature(rand)]
 #![feature(repr_simd)]
+#![feature(slice_sort_by_cached_key)]
 #![feature(test)]
 
 extern crate rand;

src/liballoc/benches/slice.rs (+15)

@@ -284,6 +284,17 @@ macro_rules! sort_expensive {
     }
 }
 
+macro_rules! sort_lexicographic {
+    ($f:ident, $name:ident, $gen:expr, $len:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            let v = $gen($len);
+            b.iter(|| v.clone().$f(|x| x.to_string()));
+            b.bytes = $len * mem::size_of_val(&$gen(1)[0]) as u64;
+        }
+    }
+}
+
 sort!(sort, sort_small_ascending, gen_ascending, 10);
 sort!(sort, sort_small_descending, gen_descending, 10);
 sort!(sort, sort_small_random, gen_random, 10);
@@ -312,6 +323,10 @@ sort!(sort_unstable, sort_unstable_large_big, gen_big_random, 10000);
 sort_strings!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_expensive, gen_random, 10000);
 
+sort_lexicographic!(sort_by_key, sort_by_key_lexicographic, gen_random, 10000);
+sort_lexicographic!(sort_unstable_by_key, sort_unstable_by_key_lexicographic, gen_random, 10000);
+sort_lexicographic!(sort_by_cached_key, sort_by_cached_key_lexicographic, gen_random, 10000);
+
 macro_rules! reverse {
     ($name:ident, $ty:ty, $f:expr) => {
         #[bench]
src/liballoc/slice.rs (+78, -10)

@@ -102,6 +102,7 @@ use core::mem::size_of;
 use core::mem;
 use core::ptr;
 use core::slice as core_slice;
+use core::{u8, u16, u32};
 
 use borrow::{Borrow, BorrowMut, ToOwned};
 use boxed::Box;
@@ -1302,7 +1303,8 @@ impl<T> [T] {
 
     /// Sorts the slice with a key extraction function.
     ///
-    /// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
+    /// This sort is stable (i.e. does not reorder equal elements) and `O(m n log(m n))`
+    /// worst-case, where the key function is `O(m)`.
     ///
     /// When applicable, unstable sorting is preferred because it is generally faster than stable
     /// sorting and it doesn't allocate auxiliary memory.
@@ -1328,12 +1330,82 @@ impl<T> [T] {
     /// ```
     #[stable(feature = "slice_sort_by_key", since = "1.7.0")]
     #[inline]
-    pub fn sort_by_key<B, F>(&mut self, mut f: F)
-        where F: FnMut(&T) -> B, B: Ord
+    pub fn sort_by_key<K, F>(&mut self, mut f: F)
+        where F: FnMut(&T) -> K, K: Ord
     {
         merge_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
+    /// Sorts the slice with a key extraction function.
+    ///
+    /// During sorting, the key function is called only once per element.
+    ///
+    /// This sort is stable (i.e. does not reorder equal elements) and `O(m n + n log n)`
+    /// worst-case, where the key function is `O(m)`.
+    ///
+    /// For simple key functions (e.g. functions that are property accesses or
+    /// basic operations), [`sort_by_key`](#method.sort_by_key) is likely to be
+    /// faster.
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
+    ///
+    /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
+    /// length of the slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(slice_sort_by_cached_key)]
+    /// let mut v = [-5i32, 4, 32, -3, 2];
+    ///
+    /// v.sort_by_cached_key(|k| k.to_string());
+    /// assert!(v == [-3, -5, 2, 32, 4]);
+    /// ```
+    ///
+    /// [pdqsort]: https://github.com/orlp/pdqsort
+    #[unstable(feature = "slice_sort_by_cached_key", issue = "34447")]
+    #[inline]
+    pub fn sort_by_cached_key<K, F>(&mut self, f: F)
+        where F: FnMut(&T) -> K, K: Ord
+    {
+        // Helper macro for indexing our vector by the smallest possible type, to reduce allocation.
+        macro_rules! sort_by_key {
+            ($t:ty, $slice:ident, $f:ident) => ({
+                let mut indices: Vec<_> =
+                    $slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
+                // The elements of `indices` are unique, as they are indexed, so any sort will be
+                // stable with respect to the original slice. We use `sort_unstable` here because
+                // it requires less memory allocation.
+                indices.sort_unstable();
+                for i in 0..$slice.len() {
+                    let mut index = indices[i].1;
+                    while (index as usize) < i {
+                        index = indices[index as usize].1;
+                    }
+                    indices[i].1 = index;
+                    $slice.swap(i, index as usize);
+                }
+            })
+        }
+
+        let sz_u8    = mem::size_of::<(K, u8)>();
+        let sz_u16   = mem::size_of::<(K, u16)>();
+        let sz_u32   = mem::size_of::<(K, u32)>();
+        let sz_usize = mem::size_of::<(K, usize)>();
+
+        let len = self.len();
+        if sz_u8  < sz_u16   && len <= ( u8::MAX as usize) { return sort_by_key!( u8, self, f) }
+        if sz_u16 < sz_u32   && len <= (u16::MAX as usize) { return sort_by_key!(u16, self, f) }
+        if sz_u32 < sz_usize && len <= (u32::MAX as usize) { return sort_by_key!(u32, self, f) }
+        sort_by_key!(usize, self, f)
+    }
+
     /// Sorts the slice, but may not preserve the order of equal elements.
     ///
     /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
@@ -1410,7 +1482,7 @@ impl<T> [T] {
     /// elements.
     ///
     /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
-    /// and `O(n log n)` worst-case.
+    /// and `O(m n log(m n))` worst-case, where the key function is `O(m)`.
     ///
     /// # Current implementation
     ///
@@ -1420,9 +1492,6 @@ impl<T> [T] {
     /// randomization to avoid degenerate cases, but with a fixed seed to always provide
     /// deterministic behavior.
     ///
-    /// It is typically faster than stable sorting, except in a few special cases, e.g. when the
-    /// slice consists of several concatenated sorted sequences.
-    ///
     /// # Examples
     ///
     /// ```
@@ -1435,9 +1504,8 @@ impl<T> [T] {
     /// [pdqsort]: https://github.com/orlp/pdqsort
     #[stable(feature = "sort_unstable", since = "1.20.0")]
     #[inline]
-    pub fn sort_unstable_by_key<B, F>(&mut self, f: F)
-        where F: FnMut(&T) -> B,
-              B: Ord
+    pub fn sort_unstable_by_key<K, F>(&mut self, f: F)
+        where F: FnMut(&T) -> K, K: Ord
     {
         core_slice::SliceExt::sort_unstable_by_key(self, f);
     }
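
The heart of the new method is the index-permutation trick above: compute every key exactly once into a `(key, original_index)` vector, sort that vector, and then swap the slice's elements into place while following chains of indices that earlier iterations have already moved. A minimal standalone sketch of the same idea, under an illustrative name and without the `u8`/`u16`/`u32` index-width optimisation, might look like this:

```rust
// Simplified re-implementation of the cached-key sort, for illustration only;
// the real method additionally shrinks the index type to reduce the allocation.
fn sort_by_cached_key_sketch<T, K: Ord>(slice: &mut [T], mut f: impl FnMut(&T) -> K) {
    // Call the key function once per element, remembering each element's start position.
    let mut indices: Vec<(K, usize)> = slice
        .iter()
        .map(|x| f(x))
        .enumerate()
        .map(|(i, k)| (k, i))
        .collect();
    // The appended indices make every pair unique, so an unstable sort is still
    // stable with respect to the original slice.
    indices.sort_unstable();
    for i in 0..slice.len() {
        let mut index = indices[i].1;
        // Positions before `i` already hold their final elements; follow the
        // recorded indices until we find where the wanted element lives now.
        while index < i {
            index = indices[index].1;
        }
        indices[i].1 = index;
        slice.swap(i, index);
    }
}

fn main() {
    let mut v = [-5i32, 4, 32, -3, 2];
    sort_by_cached_key_sketch(&mut v, |k| k.to_string());
    // Matches the doc example: lexicographic order of the decimal strings.
    assert_eq!(v, [-3, -5, 2, 32, 4]);
    println!("{:?}", v);
}
```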

src/liballoc/tests/lib.rs (+1)

@@ -23,6 +23,7 @@
 #![feature(pattern)]
 #![feature(placement_in_syntax)]
 #![feature(rand)]
+#![feature(slice_sort_by_cached_key)]
 #![feature(splice)]
 #![feature(str_escape)]
 #![feature(string_retain)]

src/liballoc/tests/slice.rs (+16, -3)

@@ -425,6 +425,14 @@ fn test_sort() {
     v.sort_by(|a, b| b.cmp(a));
     assert!(v.windows(2).all(|w| w[0] >= w[1]));
 
+    // Sort in lexicographic order.
+    let mut v1 = orig.clone();
+    let mut v2 = orig.clone();
+    v1.sort_by_key(|x| x.to_string());
+    v2.sort_by_cached_key(|x| x.to_string());
+    assert!(v1.windows(2).all(|w| w[0].to_string() <= w[1].to_string()));
+    assert!(v1 == v2);
+
     // Sort with many pre-sorted runs.
     let mut v = orig.clone();
     v.sort();
@@ -477,24 +485,29 @@ fn test_sort_stability() {
             // the second item represents which occurrence of that
             // number this element is, i.e. the second elements
             // will occur in sorted order.
-            let mut v: Vec<_> = (0..len)
+            let mut orig: Vec<_> = (0..len)
                 .map(|_| {
                     let n = thread_rng().gen::<usize>() % 10;
                     counts[n] += 1;
                     (n, counts[n])
                 })
                 .collect();
 
-            // only sort on the first element, so an unstable sort
+            let mut v = orig.clone();
+            // Only sort on the first element, so an unstable sort
             // may mix up the counts.
             v.sort_by(|&(a, _), &(b, _)| a.cmp(&b));
 
-            // this comparison includes the count (the second item
+            // This comparison includes the count (the second item
             // of the tuple), so elements with equal first items
             // will need to be ordered with increasing
             // counts... i.e. exactly asserting that this sort is
             // stable.
             assert!(v.windows(2).all(|w| w[0] <= w[1]));
+
+            let mut v = orig.clone();
+            v.sort_by_cached_key(|&(x, _)| x);
+            assert!(v.windows(2).all(|w| w[0] <= w[1]));
         }
     }
 }
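
To make the new stability assertion concrete, here is a condensed standalone version of the same check with a fixed input in place of `thread_rng` (illustrative only; it assumes a toolchain where `sort_by_cached_key` is available, i.e. nightly with the feature gate at the time of this commit):

```rust
#![feature(slice_sort_by_cached_key)] // needed while the method is unstable

fn main() {
    // (key, occurrence) pairs: the second field records the order of equal keys.
    let orig = vec![(1, 1), (0, 1), (1, 2), (0, 2), (1, 3)];

    let mut v = orig.clone();
    // Sorting on the first field only must keep equal keys in occurrence order,
    // because `sort_by_cached_key` is documented to be stable.
    v.sort_by_cached_key(|&(x, _)| x);
    assert_eq!(v, vec![(0, 1), (0, 2), (1, 1), (1, 2), (1, 3)]);
    println!("{:?}", v);
}
```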
