
Commit c67fe37

Mel Gorman authored and torvalds committed
mm: compaction: Abort async compaction if locks are contended or taking too long
Jim Schutt reported a problem that pointed at compaction contending
heavily on locks. The workload is straight-forward and in his own words;

  The systems in question have 24 SAS drives spread across 3 HBAs,
  running 24 Ceph OSD instances, one per drive. FWIW these servers are
  dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux
  clients doing dd simultaneously to a Ceph file system backed by 12 of
  these servers.

Early in the test everything looks fine

  procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
   r  b  swpd    free   buff    cache   si so  bi      bo      in     cs     us sy id wa st
  31 15  0  287216  576  38606628  0  0     2    1158      2     14   1  3 95  0  0
  27 15  0  225288  576  38583384  0  0    18 2222016 203357 134876  11 56 17 15  0
  28 17  0  219256  576  38544736  0  0    11 2305932 203141 146296  11 49 23 17  0
   6 18  0  215596  576  38552872  0  0     7 2363207 215264 166502  12 45 22 20  0
  22 18  0  226984  576  38596404  0  0     3 2445741 223114 179527  12 43 23 22  0

and then it goes to pot

  procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
   r  b  swpd    free   buff    cache   si so  bi      bo      in     cs     us sy id wa st
  163  8  0  464308  576  36791368  0  0    11   22210    866    536   3 13 79  4  0
  207 14  0  917752  576  36181928  0  0   712 1345376 134598  47367   7 90  1  2  0
  123 12  0  685516  576  36296148  0  0   429 1386615 158494  60077   8 84  5  3  0
  123 12  0  598572  576  36333728  0  0  1107 1233281 147542  62351   7 84  5  4  0
  622  7  0  660768  576  36118264  0  0   557 1345548 151394  59353   7 85  4  3  0
  223 11  0  283960  576  36463868  0  0    46 1107160 121846  33006   6 93  1  1  0

Note that system CPU usage is very high and the number of blocks being
written out has dropped by 42%. He analysed this with perf and found

  perf record -g -a sleep 10
  perf report --sort symbol --call-graph fractal,5

  34.63%  [k] _raw_spin_lock_irqsave
          |
          |--97.30%-- isolate_freepages
          |           compaction_alloc
          |           unmap_and_move
          |           migrate_pages
          |           compact_zone
          |           compact_zone_order
          |           try_to_compact_pages
          |           __alloc_pages_direct_compact
          |           __alloc_pages_slowpath
          |           __alloc_pages_nodemask
          |           alloc_pages_vma
          |           do_huge_pmd_anonymous_page
          |           handle_mm_fault
          |           do_page_fault
          |           page_fault
          |           |
          |           |--87.39%-- skb_copy_datagram_iovec
          |           |           tcp_recvmsg
          |           |           inet_recvmsg
          |           |           sock_recvmsg
          |           |           sys_recvfrom
          |           |           system_call
          |           |           __recv
          |           |           |
          |           |            --100.00%-- (nil)
          |           |
          |            --12.61%-- memcpy
           --2.70%-- [...]

There was other data but primarily it is all showing that compaction is
contended heavily on the zone->lock and the zone->lru_lock.

commit [b2eef8c: mm: compaction: minimise the time IRQs are disabled
while isolating pages for migration] noted that it was possible for
migration to hold the lru_lock for an excessive amount of time. Very
broadly speaking this patch expands the concept.

This patch introduces compact_checklock_irqsave() to check if a lock is
contended or the process needs to be scheduled. If either condition is
true then async compaction is aborted and the caller is informed. The
page allocator will fail a THP allocation if compaction failed due to
contention. This patch also introduces compact_trylock_irqsave() which
will acquire the lock only if it is not contended and the process does
not need to schedule.

Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
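
For readers skimming the diff below, the locking policy the message describes
boils down to: try to take the coarse lock; if it is contended, an async
compactor records the contention for its caller and backs out rather than
spinning, while a sync compactor reschedules and then waits. The following is
a minimal user-space sketch of that policy, assuming pthread spinlocks as a
stand-in for zone->lock/zone->lru_lock; the compact_ctl structure and the
checklock() helper are illustrative names only and are not part of the patch.

/*
 * Minimal user-space sketch (not kernel code) of the "abort async
 * compaction on contention" policy.  Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

struct compact_ctl {
        bool sync;              /* sync callers may wait; async callers may not */
        bool *contended;        /* reported back to the caller, as in the patch */
};

/* Returns true with the lock held, false if the caller should abort. */
static bool checklock(pthread_spinlock_t *lock, struct compact_ctl *cc)
{
        /* Try once without spinning. */
        if (pthread_spin_trylock(lock) == 0)
                return true;

        /* Async mode: report the contention and give up immediately. */
        if (!cc->sync) {
                if (cc->contended)
                        *cc->contended = true;
                return false;
        }

        /* Sync mode: yield the CPU, then wait for the lock. */
        sched_yield();
        pthread_spin_lock(lock);
        return true;
}

int main(void)
{
        pthread_spinlock_t lock;
        bool contended = false;
        struct compact_ctl cc = { .sync = false, .contended = &contended };

        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);

        if (checklock(&lock, &cc)) {
                /* ... work that needs the lock ... */
                pthread_spin_unlock(&lock);
        } else {
                printf("aborted, contended=%d\n", contended);
        }

        pthread_spin_destroy(&lock);
        return 0;
}

In the kernel the equivalent decision is made by compact_checklock_irqsave(),
where need_resched() || spin_is_contended() plays the role of the failed
trylock, and the page allocator treats a reported contention like deferred
compaction: a __GFP_NO_KSWAPD (THP) allocation goes to nopage instead of
entering direct reclaim.
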
1 parent de74f1c commit c67fe37

Showing 4 changed files with 93 additions and 29 deletions.


include/linux/compaction.h (+2, -2)

@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
-                       bool sync);
+                       bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
@@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync)
+                       bool sync, bool *contended)
 {
        return COMPACT_CONTINUE;
 }

mm/compaction.c (+79, -21)

@@ -50,6 +50,47 @@ static inline bool migrate_async_suitable(int migratetype)
        return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out in the event
+ * if contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+                                      bool locked, struct compact_control *cc)
+{
+       if (need_resched() || spin_is_contended(lock)) {
+               if (locked) {
+                       spin_unlock_irqrestore(lock, *flags);
+                       locked = false;
+               }
+
+               /* async aborts if taking too long or contended */
+               if (!cc->sync) {
+                       if (cc->contended)
+                               *cc->contended = true;
+                       return false;
+               }
+
+               cond_resched();
+               if (fatal_signal_pending(current))
+                       return false;
+       }
+
+       if (!locked)
+               spin_lock_irqsave(lock, *flags);
+       return true;
+}
+
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+                       unsigned long *flags, struct compact_control *cc)
+{
+       return compact_checklock_irqsave(lock, flags, false, cc);
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -173,16 +214,22 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 /* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
 {
        struct page *page;
        unsigned int count[2] = { 0, };
 
        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;
 
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       /* If locked we can use the interrupt unsafe versions */
+       if (locked) {
+               __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+               __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       } else {
+               mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+               mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       }
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        struct list_head *migratelist = &cc->migratepages;
        isolate_mode_t mode = 0;
        struct lruvec *lruvec;
+       unsigned long flags;
+       bool locked;
 
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
        /* Time to isolate some pages for migration */
        cond_resched();
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irqsave(&zone->lru_lock, flags);
+       locked = true;
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
-               bool locked = true;
 
                /* give a chance to irqs before checking need_resched() */
                if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-                       spin_unlock_irq(&zone->lru_lock);
+                       spin_unlock_irqrestore(&zone->lru_lock, flags);
                        locked = false;
                }
-               if (need_resched() || spin_is_contended(&zone->lru_lock)) {
-                       if (locked)
-                               spin_unlock_irq(&zone->lru_lock);
-                       cond_resched();
-                       spin_lock_irq(&zone->lru_lock);
-                       if (fatal_signal_pending(current))
-                               break;
-               } else if (!locked)
-                       spin_lock_irq(&zone->lru_lock);
+
+               /* Check if it is ok to still hold the lock */
+               locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+                                                       locked, cc);
+               if (!locked)
+                       break;
 
                /*
                 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                }
        }
 
-       acct_isolated(zone, cc);
+       acct_isolated(zone, locked, cc);
 
-       spin_unlock_irq(&zone->lru_lock);
+       if (locked)
+               spin_unlock_irqrestore(&zone->lru_lock, flags);
 
        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
                 * are disabled
                 */
                isolated = 0;
-               spin_lock_irqsave(&zone->lock, flags);
+
+               /*
+                * The zone lock must be held to isolate freepages. This
+                * unfortunately this is a very coarse lock and can be
+                * heavily contended if there are parallel allocations
+                * or parallel compactions. For async compaction do not
+                * spin on the lock
+                */
+               if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
+                       break;
                if (suitable_migration_target(page)) {
                        end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
                        isolated = isolate_freepages_block(pfn, end_pfn,
@@ -773,7 +829,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 static unsigned long compact_zone_order(struct zone *zone,
                                 int order, gfp_t gfp_mask,
-                                bool sync)
+                                bool sync, bool *contended)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
                .sync = sync,
+               .contended = contended,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync)
+                       bool sync, bool *contended)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
@@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                                                                nodemask) {
                int status;
 
-               status = compact_zone_order(zone, order, gfp_mask, sync);
+               status = compact_zone_order(zone, order, gfp_mask, sync,
+                                               contended);
                rc = max(status, rc);
 
                /* If a normal allocation would succeed, stop compacting */

mm/internal.h (+1, -0)

@@ -130,6 +130,7 @@ struct compact_control {
        int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
+       bool *contended;                /* True if a lock was contended */
 };
 
 unsigned long

mm/page_alloc.c (+11, -6)

@@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-       bool *deferred_compaction,
+       bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        struct page *page;
@@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                               nodemask, sync_migration);
+                                               nodemask, sync_migration,
+                                               contended_compaction);
        current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
@@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-       bool *deferred_compaction,
+       bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        return NULL;
@@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long did_some_progress;
        bool sync_migration = false;
        bool deferred_compaction = false;
+       bool contended_compaction = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2425,6 +2427,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                       &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
        if (page)
@@ -2434,10 +2437,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        /*
         * If compaction is deferred for high-order allocations, it is because
         * sync compaction recently failed. In this is the case and the caller
-        * has requested the system not be heavily disrupted, fail the
-        * allocation now instead of entering direct reclaim
+        * requested a movable allocation that does not heavily disrupt the
+        * system then fail the allocation instead of entering direct reclaim.
         */
-       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+       if ((deferred_compaction || contended_compaction) &&
+                                               (gfp_mask & __GFP_NO_KSWAPD))
                goto nopage;
 
        /* Try direct reclaim and then allocating */
@@ -2508,6 +2512,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                       &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
        if (page)
