Commit 21a9273

Sagi Grimberg authored and Linus Torvalds committed
mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule
With an RCU based mmu_notifier implementation, any callout to
mmu_notifier_invalidate_range_{start,end}() or
mmu_notifier_invalidate_page() would not be allowed to call schedule(),
as that could potentially allow a modification to the mmu_notifier
structure while it is currently being used.

Since SRCU allocates four machine words per instance per CPU, we may
end up with memory exhaustion if we use one SRCU instance per mm, so
all mms share a single global SRCU instance. Note that during heavy
mmu_notifier activity the exit and unregister paths might hang for
longer periods, but this is tolerable for current mmu_notifier clients.

Signed-off-by: Sagi Grimberg <sagig@mellanox.co.il>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 48af0d7 commit 21a9273
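
For readers unfamiliar with the SRCU API this commit moves to, the read
side works like the minimal sketch below (illustrative only, not part of
the commit; the example_* names are hypothetical). Unlike
rcu_read_lock(), an SRCU read-side critical section may block, which is
what lets the notifier callouts schedule(). The memory argument in the
message follows from SRCU keeping per-CPU counters: a per-mm instance
would cost four machine words times nr_cpu_ids for every mm in the
system, so one global instance is shared instead.

#include <linux/init.h>
#include <linux/srcu.h>

static struct srcu_struct example_srcu; /* hypothetical global, like the patch's "srcu" */

static void example_callout(void)
{
        int id;

        /* srcu_read_lock() returns an index that must be handed back */
        id = srcu_read_lock(&example_srcu);
        /* ... walk a notifier list; sleeping is allowed in here ... */
        srcu_read_unlock(&example_srcu, id);
}

static int __init example_init(void)
{
        /* allocates the per-CPU counters; must run before any reader */
        return init_srcu_struct(&example_srcu);
}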

File tree: 2 files changed (+49, -25)

  include/linux/mmu_notifier.h
  mm/mmu_notifier.c

include/linux/mmu_notifier.h (+1)

@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
+#include <linux/srcu.h>

 struct mmu_notifier;
 struct mmu_notifier_ops;
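
For context, the parts of the <linux/srcu.h> interface that the rest of
this patch relies on look roughly like this (a paraphrased sketch of the
kernel API, not a copy of the header):

struct srcu_struct;     /* opaque here; holds the per-CPU reader counters */

int init_srcu_struct(struct srcu_struct *sp);           /* allocate per-CPU state */
int srcu_read_lock(struct srcu_struct *sp);             /* enter read side, returns index */
void srcu_read_unlock(struct srcu_struct *sp, int idx); /* exit read side */
void synchronize_srcu(struct srcu_struct *sp);          /* wait out current readers */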

mm/mmu_notifier.c (+48, -25)

@@ -14,32 +14,37 @@
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>

+/* global SRCU for all MMs */
+struct srcu_struct srcu;
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
  * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
 void __mmu_notifier_release(struct mm_struct *mm)
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;

        /*
         * RCU here will block mmu_notifier_unregister until
         * ->release returns.
         */
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
                /*
                 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);

        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,15 +73,15 @@ void __mmu_notifier_release(struct mm_struct *mm)
        spin_unlock(&mm->mmu_notifier_mm->lock);

        /*
-        * synchronize_rcu here prevents mmu_notifier_release to
+        * synchronize_srcu here prevents mmu_notifier_release to
         * return to exit_mmap (which would proceed freeing all pages
         * in the mm) until the ->release method returns, if it was
         * invoked by mmu_notifier_unregister.
         *
         * The mmu_notifier_mm can't go away from under us because one
         * mm_count is hold by exit_mmap.
         */
-       synchronize_rcu();
+       synchronize_srcu(&srcu);
 }

 /*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
-       int young = 0;
+       int young = 0, id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->clear_flush_young)
                        young |= mn->ops->clear_flush_young(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);

        return young;
 }
@@ -106,17 +111,17 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
-       int young = 0;
+       int young = 0, id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->test_young) {
                        young = mn->ops->test_young(mn, mm, address);
                        if (young)
                                break;
                }
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);

        return young;
 }
@@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->change_pte)
                        mn->ops->change_pte(mn, mm, address, pte);
@@ -138,49 +144,52 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                else if (mn->ops->invalidate_page)
                        mn->ops->invalidate_page(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }

 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
                                          unsigned long address)
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_page)
                        mn->ops->invalidate_page(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }

 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_start)
                        mn->ops->invalidate_range_start(mn, mm, start, end);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }

 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;

-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_end)
                        mn->ops->invalidate_range_end(mn, mm, start, end);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }

 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +201,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,

        BUG_ON(atomic_read(&mm->mm_users) <= 0);

+       /*
+        * Verify that mmu_notifier_init() already run and the global srcu is
+        * initialized.
+        */
+       BUG_ON(!srcu.per_cpu_ref);
+
        ret = -ENOMEM;
        mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
        if (unlikely(!mmu_notifier_mm))
@@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -290,16 +305,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
                 * RCU here will force exit_mmap to wait ->release to finish
                 * before freeing the pages.
                 */
-               rcu_read_lock();
+               int id;

+               id = srcu_read_lock(&srcu);
                /*
                 * exit_mmap will block in mmu_notifier_release to
                 * guarantee ->release is called before freeing the
                 * pages.
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
-               rcu_read_unlock();
+               srcu_read_unlock(&srcu, id);

                spin_lock(&mm->mmu_notifier_mm->lock);
                hlist_del_rcu(&mn->hlist);
@@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
         * Wait any running method to finish, of course including
         * ->release if it was run by mmu_notifier_relase instead of us.
         */
-       synchronize_rcu();
+       synchronize_srcu(&srcu);

        BUG_ON(atomic_read(&mm->mm_count) <= 0);

        mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+       return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
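
To see what the change buys a client, here is a sketch of a notifier
whose callback sleeps, which would have been illegal under
rcu_read_lock(). The my_* names and the msleep() stand-in are
hypothetical, not part of this commit:

#include <linux/delay.h>
#include <linux/mmu_notifier.h>

/* Hypothetical callback: under the global SRCU, blocking here is legal. */
static void my_invalidate_range_start(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
{
        msleep(1);      /* stand-in for e.g. waiting on a device TLB flush */
}

static const struct mmu_notifier_ops my_ops = {
        .invalidate_range_start = my_invalidate_range_start,
};

static struct mmu_notifier my_mn = {
        .ops = &my_ops,
};

/* Registered against a live mm, e.g. from a driver ioctl path:
 *      err = mmu_notifier_register(&my_mn, current->mm);
 */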
