Commit 9458964
Author: Alexei Starovoitov (committed)
Merge branch 'fix-the-unmatched-unit_size-of-bpf_mem_cache'
Hou Tao says:

====================
Fix the unmatched unit_size of bpf_mem_cache

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the reported warning [0] when the unit_size of
bpf_mem_cache is mismatched with the object size of the underlying slab
cache.

Patch #1 fixes the warning by adjusting size_index according to the value of
KMALLOC_MIN_SIZE, so a bpf_mem_cache whose unit_size is smaller than
KMALLOC_MIN_SIZE, or is not aligned with KMALLOC_MIN_SIZE, will be redirected
to a bpf_mem_cache with a bigger unit_size. Patch #2 skips prefill for these
redirected bpf_mem_caches to save memory. Patch #3 adds a further error check
in bpf_mem_alloc_init() to ensure the unit_size and object_size are always
matched, and to prevent potential issues due to a mismatch.

Please see the individual patches for more details. Comments are always
welcome.

[0]: https://lore.kernel.org/bpf/87jztjmmy4.fsf@all.your.base.are.belong.to.us
====================

Link: https://lore.kernel.org/r/20230908133923.2675053-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents 7182e56 + f0a42ab commit 9458964
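To make the redirection in the cover letter concrete, here is a small standalone sketch in userspace C (not kernel code). It assumes a hypothetical KMALLOC_MIN_SIZE of 64 and uses backing_size() as an illustrative approximation of the slab size classes, not a kernel API: unit sizes below or unaligned with the minimum end up backed by a larger object, so their bpf_mem_cache is redirected and skips prefill.

        /* Sketch: which bpf_mem_alloc size classes get redirected when the
         * smallest slab object is 64 bytes.  backing_size() is a demo
         * approximation of the kmalloc size classes, not kernel code.
         */
        #include <stdio.h>

        #define KMALLOC_MIN_SIZE 64     /* assumed for the demo; typically 8, can be up to 256 */

        /* The bpf_mem_alloc size classes from bpf_mem_alloc_init() */
        static const unsigned int unit_sizes[] = {
                96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096
        };

        /* Approximate the slab object size that actually backs a request:
         * sizes up to KMALLOC_MIN_SIZE share the smallest cache, and the
         * 96/192-byte caches disappear once the minimum alignment reaches
         * 64/128 bytes (the cases bpf_mem_cache_adjust_size() handles).
         */
        static unsigned int backing_size(unsigned int size)
        {
                unsigned int s;

                if (size <= KMALLOC_MIN_SIZE)
                        return KMALLOC_MIN_SIZE;
                if (size == 96)
                        return KMALLOC_MIN_SIZE >= 64 ? 128 : 96;
                if (size == 192)
                        return KMALLOC_MIN_SIZE >= 128 ? 256 : 192;
                /* remaining classes are powers of two */
                for (s = KMALLOC_MIN_SIZE; s < size; s <<= 1)
                        ;
                return s;
        }

        int main(void)
        {
                unsigned int i;

                for (i = 0; i < sizeof(unit_sizes) / sizeof(unit_sizes[0]); i++) {
                        unsigned int backed = backing_size(unit_sizes[i]);

                        printf("unit_size %4u -> backed by %4u%s\n", unit_sizes[i], backed,
                               backed == unit_sizes[i] ? "" : "  (redirected, prefill skipped)");
                }
                return 0;
        }

With KMALLOC_MIN_SIZE set to 128 or 256 instead, the 192-byte class is redirected as well, matching the 256-byte fallback handled by bpf_mem_cache_adjust_size() in the diff below.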

3 files changed: +256 -4 lines

kernel/bpf/memalloc.c (+83 -4)
@@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * Typical case will be between 11K and 116K closer to 11K.
  * bpf progs can and should share bpf_mem_cache when possible.
  */
-
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void init_refill_work(struct bpf_mem_cache *c)
 {
         init_irq_work(&c->refill_work, bpf_mem_refill);
         if (c->unit_size <= 256) {
@@ -476,14 +475,35 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
                 c->high_watermark = max(96 * 256 / c->unit_size, 3);
         }
         c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
 
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
         /* To avoid consuming memory assume that 1st run of bpf
          * prog won't be doing more than 4 map_update_elem from
          * irq disabled region
          */
         alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
 }
 
+static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
+{
+        struct llist_node *first;
+        unsigned int obj_size;
+
+        first = c->free_llist.first;
+        if (!first)
+                return 0;
+
+        obj_size = ksize(first);
+        if (obj_size != c->unit_size) {
+                WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
+                          idx, obj_size, c->unit_size);
+                return -EINVAL;
+        }
+        return 0;
+}
+
 /* When size != 0 bpf_mem_cache for each cpu.
  * This is typical bpf hash map use case when all elements have equal size.
  *
@@ -494,10 +514,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
         static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+        int cpu, i, err, unit_size, percpu_size = 0;
         struct bpf_mem_caches *cc, __percpu *pcc;
         struct bpf_mem_cache *c, __percpu *pc;
         struct obj_cgroup *objcg = NULL;
-        int cpu, i, unit_size, percpu_size = 0;
 
         if (size) {
                 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -521,6 +541,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
                         c->objcg = objcg;
                         c->percpu_size = percpu_size;
                         c->tgt = c;
+                        init_refill_work(c);
                         prefill_mem_cache(c, cpu);
                 }
                 ma->cache = pc;
@@ -534,6 +555,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
         pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
         if (!pcc)
                 return -ENOMEM;
+        err = 0;
 #ifdef CONFIG_MEMCG_KMEM
         objcg = get_obj_cgroup_from_current();
 #endif
@@ -544,11 +566,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
                         c->unit_size = sizes[i];
                         c->objcg = objcg;
                         c->tgt = c;
+
+                        init_refill_work(c);
+                        /* Another bpf_mem_cache will be used when allocating
+                         * c->unit_size in bpf_mem_alloc(), so doesn't prefill
+                         * for the bpf_mem_cache because these free objects will
+                         * never be used.
+                         */
+                        if (i != bpf_mem_cache_idx(c->unit_size))
+                                continue;
                         prefill_mem_cache(c, cpu);
+                        err = check_obj_size(c, i);
+                        if (err)
+                                goto out;
                 }
         }
+
+out:
         ma->caches = pcc;
-        return 0;
+        /* refill_work is either zeroed or initialized, so it is safe to
+         * call irq_work_sync().
+         */
+        if (err)
+                bpf_mem_alloc_destroy(ma);
+        return err;
 }
 
 static void drain_mem_cache(struct bpf_mem_cache *c)
@@ -916,3 +957,41 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
         return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
+
+/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
+static __init int bpf_mem_cache_adjust_size(void)
+{
+        unsigned int size, index;
+
+        /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
+         * up-to 256-bytes.
+         */
+        size = KMALLOC_MIN_SIZE;
+        if (size <= 192)
+                index = size_index[(size - 1) / 8];
+        else
+                index = fls(size - 1) - 1;
+        for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
+                size_index[(size - 1) / 8] = index;
+
+        /* The minimal alignment is 64-bytes, so disable 96-bytes cache and
+         * use 128-bytes cache instead.
+         */
+        if (KMALLOC_MIN_SIZE >= 64) {
+                index = size_index[(128 - 1) / 8];
+                for (size = 64 + 8; size <= 96; size += 8)
+                        size_index[(size - 1) / 8] = index;
+        }
+
+        /* The minimal alignment is 128-bytes, so disable 192-bytes cache and
+         * use 256-bytes cache instead.
+         */
+        if (KMALLOC_MIN_SIZE >= 128) {
+                index = fls(256 - 1) - 1;
+                for (size = 128 + 8; size <= 192; size += 8)
+                        size_index[(size - 1) / 8] = index;
+        }
+
+        return 0;
+}
+subsys_initcall(bpf_mem_cache_adjust_size);
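The intent of the check_obj_size() addition can also be reproduced in a few lines of plain userspace C. Everything below is a sketch under stated assumptions: demo_cache and backing_size() are stand-ins for struct bpf_mem_cache and ksize(), and the report/error value only mimic the kernel's WARN_ONCE()/-EINVAL behaviour.

        #include <stdbool.h>
        #include <stdio.h>

        /* Stand-in for struct bpf_mem_cache: only the field the check needs. */
        struct demo_cache {
                unsigned int unit_size;
        };

        /* Pretend the backing allocator only hands out 64/128/256/... byte
         * objects, i.e. a system where the 96-byte cache is unavailable.
         */
        static unsigned int backing_size(unsigned int size)
        {
                unsigned int s = 64;

                while (s < size)
                        s <<= 1;
                return s;
        }

        /* Mirrors the shape of the kernel's check_obj_size(): compare what the
         * allocator really provides against the cache's unit_size, complain once.
         */
        static int check_obj_size(const struct demo_cache *c, unsigned int idx)
        {
                static bool warned;
                unsigned int obj_size = backing_size(c->unit_size);

                if (obj_size != c->unit_size) {
                        if (!warned) {  /* WARN_ONCE-style: only the first mismatch is reported */
                                fprintf(stderr, "cache[%u]: unexpected object size %u, expect %u\n",
                                        idx, obj_size, c->unit_size);
                                warned = true;
                        }
                        return -1;      /* the kernel returns -EINVAL and unwinds via bpf_mem_alloc_destroy() */
                }
                return 0;
        }

        int main(void)
        {
                struct demo_cache ok  = { .unit_size = 128 };
                struct demo_cache bad = { .unit_size = 96 };    /* backed by a 128-byte object here */

                printf("128-byte cache: %d\n", check_obj_size(&ok, 0));
                printf(" 96-byte cache: %d\n", check_obj_size(&bad, 1));
                return 0;
        }

A 96-byte unit_size backed by a 128-byte object is one such mismatch once the 96-byte kmalloc cache is disabled; after patch #1 that class is redirected instead, so the check only fires if the redirection and the slab layout ever disagree.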
New file (+50 lines)
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <bpf/btf.h>
+#include <test_progs.h>
+
+#include "test_bpf_ma.skel.h"
+
+void test_test_bpf_ma(void)
+{
+        struct test_bpf_ma *skel;
+        struct btf *btf;
+        int i, err;
+
+        skel = test_bpf_ma__open();
+        if (!ASSERT_OK_PTR(skel, "open"))
+                return;
+
+        btf = bpf_object__btf(skel->obj);
+        if (!ASSERT_OK_PTR(btf, "btf"))
+                goto out;
+
+        for (i = 0; i < ARRAY_SIZE(skel->rodata->data_sizes); i++) {
+                char name[32];
+                int id;
+
+                snprintf(name, sizeof(name), "bin_data_%u", skel->rodata->data_sizes[i]);
+                id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+                if (!ASSERT_GT(id, 0, "bin_data"))
+                        goto out;
+                skel->rodata->data_btf_ids[i] = id;
+        }
+
+        err = test_bpf_ma__load(skel);
+        if (!ASSERT_OK(err, "load"))
+                goto out;
+
+        err = test_bpf_ma__attach(skel);
+        if (!ASSERT_OK(err, "attach"))
+                goto out;
+
+        skel->bss->pid = getpid();
+        usleep(1);
+        ASSERT_OK(skel->bss->err, "test error");
+out:
+        test_bpf_ma__destroy(skel);
+}
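Assuming the usual BPF selftests workflow, the new test can be run on its own through the test_progs filter, with the test name taken from the test_test_bpf_ma() function above:

        $ cd tools/testing/selftests/bpf && make
        $ ./test_progs -t test_bpf_ma

Before loading the skeleton, the test resolves the BTF ID of each bin_data_<size> struct and writes it into the read-only data_btf_ids array, which the BPF program below passes to bpf_obj_new_impl().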
New file (+123 lines)
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+struct generic_map_value {
+        void *data;
+};
+
+char _license[] SEC("license") = "GPL";
+
+const unsigned int data_sizes[] = {8, 16, 32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096};
+const volatile unsigned int data_btf_ids[ARRAY_SIZE(data_sizes)] = {};
+
+int err = 0;
+int pid = 0;
+
+#define DEFINE_ARRAY_WITH_KPTR(_size) \
+        struct bin_data_##_size { \
+                char data[_size - sizeof(void *)]; \
+        }; \
+        struct map_value_##_size { \
+                struct bin_data_##_size __kptr * data; \
+                /* To emit BTF info for bin_data_xx */ \
+                struct bin_data_##_size not_used; \
+        }; \
+        struct { \
+                __uint(type, BPF_MAP_TYPE_ARRAY); \
+                __type(key, int); \
+                __type(value, struct map_value_##_size); \
+                __uint(max_entries, 128); \
+        } array_##_size SEC(".maps");
+
+static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int batch,
+                                             unsigned int idx)
+{
+        struct generic_map_value *value;
+        unsigned int i, key;
+        void *old, *new;
+
+        for (i = 0; i < batch; i++) {
+                key = i;
+                value = bpf_map_lookup_elem(map, &key);
+                if (!value) {
+                        err = 1;
+                        return;
+                }
+                new = bpf_obj_new_impl(data_btf_ids[idx], NULL);
+                if (!new) {
+                        err = 2;
+                        return;
+                }
+                old = bpf_kptr_xchg(&value->data, new);
+                if (old) {
+                        bpf_obj_drop(old);
+                        err = 3;
+                        return;
+                }
+        }
+        for (i = 0; i < batch; i++) {
+                key = i;
+                value = bpf_map_lookup_elem(map, &key);
+                if (!value) {
+                        err = 4;
+                        return;
+                }
+                old = bpf_kptr_xchg(&value->data, NULL);
+                if (!old) {
+                        err = 5;
+                        return;
+                }
+                bpf_obj_drop(old);
+        }
+}
+
+#define CALL_BATCH_ALLOC_FREE(size, batch, idx) \
+        batch_alloc_free((struct bpf_map *)(&array_##size), batch, idx)
+
+DEFINE_ARRAY_WITH_KPTR(8);
+DEFINE_ARRAY_WITH_KPTR(16);
+DEFINE_ARRAY_WITH_KPTR(32);
+DEFINE_ARRAY_WITH_KPTR(64);
+DEFINE_ARRAY_WITH_KPTR(96);
+DEFINE_ARRAY_WITH_KPTR(128);
+DEFINE_ARRAY_WITH_KPTR(192);
+DEFINE_ARRAY_WITH_KPTR(256);
+DEFINE_ARRAY_WITH_KPTR(512);
+DEFINE_ARRAY_WITH_KPTR(1024);
+DEFINE_ARRAY_WITH_KPTR(2048);
+DEFINE_ARRAY_WITH_KPTR(4096);
+
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int test_bpf_mem_alloc_free(void *ctx)
+{
+        if ((u32)bpf_get_current_pid_tgid() != pid)
+                return 0;
+
+        /* Alloc 128 8-bytes objects in batch to trigger refilling,
+         * then free 128 8-bytes objects in batch to trigger freeing.
+         */
+        CALL_BATCH_ALLOC_FREE(8, 128, 0);
+        CALL_BATCH_ALLOC_FREE(16, 128, 1);
+        CALL_BATCH_ALLOC_FREE(32, 128, 2);
+        CALL_BATCH_ALLOC_FREE(64, 128, 3);
+        CALL_BATCH_ALLOC_FREE(96, 128, 4);
+        CALL_BATCH_ALLOC_FREE(128, 128, 5);
+        CALL_BATCH_ALLOC_FREE(192, 128, 6);
+        CALL_BATCH_ALLOC_FREE(256, 128, 7);
+        CALL_BATCH_ALLOC_FREE(512, 64, 8);
+        CALL_BATCH_ALLOC_FREE(1024, 32, 9);
+        CALL_BATCH_ALLOC_FREE(2048, 16, 10);
+        CALL_BATCH_ALLOC_FREE(4096, 8, 11);
+
+        return 0;
+}
