Skip to content

Commit dd51fb8

Browse files
Hugh Dickins authored; gregkh committed
Hugh Dickins
authored and gregkh committed
memcg: fix hotplugged memory zone oops
commit bea8c15 upstream. When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ raspberrypi#180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exception. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 7adaec3 commit dd51fb8

File tree

4 files changed

+38
-18
lines changed

4 files changed

+38
-18
lines changed

include/linux/mmzone.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -744,7 +744,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
744744
unsigned long size,
745745
enum memmap_context context);
746746

747-
extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
747+
extern void lruvec_init(struct lruvec *lruvec);
748748

749749
static inline struct zone *lruvec_zone(struct lruvec *lruvec)
750750
{

mm/memcontrol.c

+35-11
Original file line numberDiff line numberDiff line change
@@ -1061,12 +1061,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
10611061
struct mem_cgroup *memcg)
10621062
{
10631063
struct mem_cgroup_per_zone *mz;
1064+
struct lruvec *lruvec;
10641065

1065-
if (mem_cgroup_disabled())
1066-
return &zone->lruvec;
1066+
if (mem_cgroup_disabled()) {
1067+
lruvec = &zone->lruvec;
1068+
goto out;
1069+
}
10671070

10681071
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1069-
return &mz->lruvec;
1072+
lruvec = &mz->lruvec;
1073+
out:
1074+
/*
1075+
* Since a node can be onlined after the mem_cgroup was created,
1076+
* we have to be prepared to initialize lruvec->zone here;
1077+
* and if offlined then reonlined, we need to reinitialize it.
1078+
*/
1079+
if (unlikely(lruvec->zone != zone))
1080+
lruvec->zone = zone;
1081+
return lruvec;
10701082
}
10711083

10721084
/*
@@ -1093,9 +1105,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
10931105
struct mem_cgroup_per_zone *mz;
10941106
struct mem_cgroup *memcg;
10951107
struct page_cgroup *pc;
1108+
struct lruvec *lruvec;
10961109

1097-
if (mem_cgroup_disabled())
1098-
return &zone->lruvec;
1110+
if (mem_cgroup_disabled()) {
1111+
lruvec = &zone->lruvec;
1112+
goto out;
1113+
}
10991114

11001115
pc = lookup_page_cgroup(page);
11011116
memcg = pc->mem_cgroup;
@@ -1113,7 +1128,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
11131128
pc->mem_cgroup = memcg = root_mem_cgroup;
11141129

11151130
mz = page_cgroup_zoneinfo(memcg, page);
1116-
return &mz->lruvec;
1131+
lruvec = &mz->lruvec;
1132+
out:
1133+
/*
1134+
* Since a node can be onlined after the mem_cgroup was created,
1135+
* we have to be prepared to initialize lruvec->zone here;
1136+
* and if offlined then reonlined, we need to reinitialize it.
1137+
*/
1138+
if (unlikely(lruvec->zone != zone))
1139+
lruvec->zone = zone;
1140+
return lruvec;
11171141
}
11181142

11191143
/**
@@ -3703,17 +3727,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
37033727
static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
37043728
int node, int zid, enum lru_list lru)
37053729
{
3706-
struct mem_cgroup_per_zone *mz;
3730+
struct lruvec *lruvec;
37073731
unsigned long flags, loop;
37083732
struct list_head *list;
37093733
struct page *busy;
37103734
struct zone *zone;
37113735

37123736
zone = &NODE_DATA(node)->node_zones[zid];
3713-
mz = mem_cgroup_zoneinfo(memcg, node, zid);
3714-
list = &mz->lruvec.lists[lru];
3737+
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3738+
list = &lruvec->lists[lru];
37153739

3716-
loop = mz->lru_size[lru];
3740+
loop = mem_cgroup_get_lru_size(lruvec, lru);
37173741
/* give some margin against EBUSY etc...*/
37183742
loop += 256;
37193743
busy = NULL;
@@ -4751,7 +4775,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
47514775

47524776
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
47534777
mz = &pn->zoneinfo[zone];
4754-
lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4778+
lruvec_init(&mz->lruvec);
47554779
mz->usage_in_excess = 0;
47564780
mz->on_tree = false;
47574781
mz->memcg = memcg;

mm/mmzone.c

+1-5
Original file line numberDiff line numberDiff line change
@@ -87,16 +87,12 @@ int memmap_valid_within(unsigned long pfn,
8787
}
8888
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
8989

90-
void lruvec_init(struct lruvec *lruvec, struct zone *zone)
90+
void lruvec_init(struct lruvec *lruvec)
9191
{
9292
enum lru_list lru;
9393

9494
memset(lruvec, 0, sizeof(struct lruvec));
9595

9696
for_each_lru(lru)
9797
INIT_LIST_HEAD(&lruvec->lists[lru]);
98-
99-
#ifdef CONFIG_MEMCG
100-
lruvec->zone = zone;
101-
#endif
10298
}

mm/page_alloc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
44564456
zone->zone_pgdat = pgdat;
44574457

44584458
zone_pcp_init(zone);
4459-
lruvec_init(&zone->lruvec, zone);
4459+
lruvec_init(&zone->lruvec);
44604460
if (!size)
44614461
continue;
44624462

0 commit comments

Comments
 (0)