Commit 3a0af8f

tgraf authored and davem330 committed
bpf: BPF for lightweight tunnel infrastructure
Registers new BPF program types which correspond to the LWT hooks:
 - BPF_PROG_TYPE_LWT_IN   => dst_input()
 - BPF_PROG_TYPE_LWT_OUT  => dst_output()
 - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and may only read the data of an skb. This prevents modification and possible invalidation of already validated packet headers on receive and the construction of illegal headers while the IP headers are still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet content as well as prepend an L2 header via the newly introduced helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is invoked after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return one of the following return codes:

 BPF_OK       - Continue routing as per nexthop
 BPF_DROP     - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_ relatives to ease compatibility.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent efd8570 commit 3a0af8f
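To make the new program types concrete, here is a minimal sketch of a program for the dst_input() hook (BPF_PROG_TYPE_LWT_IN). It only reads packet data, as this hook requires, and returns the new BPF_OK/BPF_DROP codes. The section name, the SEC() macro and the drop-UDP policy are illustrative assumptions, not part of this commit.

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/in.h>

#define SEC(NAME) __attribute__((section(NAME), used))

SEC("lwt_in")
int drop_udp(struct __sk_buff *skb)
{
	/* LWT programs see the skb with the L3 header at skb->data */
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct iphdr *iph = data;

	/* bounds check is mandatory before direct packet access */
	if (data + sizeof(*iph) > data_end)
		return BPF_OK;

	if (iph->version == 4 && iph->protocol == IPPROTO_UDP)
		return BPF_DROP;	/* the stack sees -EPERM for dropped skbs */

	return BPF_OK;			/* continue routing as per nexthop */
}

char _license[] SEC("license") = "GPL";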

File tree

9 files changed: +646 -5 lines


include/linux/filter.h

+1 -1

@@ -438,7 +438,7 @@ struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {

include/uapi/linux/bpf.h

+31 -1

@@ -101,6 +101,9 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_change_head()
+ *     Grows headroom of skb and adjusts MAC header offset accordingly.
+ *     Will extends/reallocae as required automatically.
+ *     May change skb data pointer and will thus invalidate any check
+ *     performed for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -453,7 +466,8 @@ union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_change_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
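The helper and return codes above combine naturally in an xmit program. Below is a hedged sketch of a BPF_PROG_TYPE_LWT_XMIT program that prepends an Ethernet header with bpf_skb_change_head() and hands the skb to another device; the helper stubs (written in the style of samples/bpf/bpf_helpers.h), the target ifindex and the placeholder MAC addresses are assumptions made for this example.

#include <linux/bpf.h>
#include <linux/if_ether.h>

#define SEC(NAME) __attribute__((section(NAME), used))

static int (*bpf_skb_change_head)(void *skb, __u32 len, __u64 flags) =
	(void *) BPF_FUNC_skb_change_head;
static int (*bpf_redirect)(int ifindex, __u32 flags) =
	(void *) BPF_FUNC_redirect;

#define TARGET_IFINDEX 4	/* placeholder egress device */

SEC("lwt_xmit")
int push_l2_and_redirect(struct __sk_buff *skb)
{
	void *data, *data_end;
	struct ethhdr *eth;

	/* grow headroom by 14 bytes; returns a negative error on failure */
	if (bpf_skb_change_head(skb, sizeof(*eth), 0))
		return BPF_DROP;

	/* the helper may move skb->data, so re-derive and re-check bounds */
	data = (void *)(long)skb->data;
	data_end = (void *)(long)skb->data_end;
	eth = data;
	if (data + sizeof(*eth) > data_end)
		return BPF_DROP;

	__builtin_memset(eth->h_dest, 0xff, ETH_ALEN);	/* placeholder MACs */
	__builtin_memset(eth->h_source, 0, ETH_ALEN);
	eth->h_proto = skb->protocol;	/* already in network byte order */

	bpf_redirect(TARGET_IFINDEX, 0);
	return BPF_REDIRECT;	/* only valid in lwtunnel_xmit() context */
}

char _license[] SEC("license") = "GPL";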

include/uapi/linux/lwtunnel.h

+23

@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	LWT_BPF_XMIT_HEADROOM,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 256
+
 #endif /* _UAPI_LWTUNNEL_H_ */
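For context, here is a hedged sketch of how a userspace tool might fill these attributes when installing a route over rtnetlink. The commit itself only adds the UAPI; the function below, its parameters and the use of libmnl are assumptions made for illustration.

#include <libmnl/libmnl.h>
#include <linux/rtnetlink.h>
#include <linux/lwtunnel.h>

/* Append an LWT BPF xmit encap to an RTM_NEWROUTE message being built in nlh.
 * prog_fd refers to an already loaded BPF_PROG_TYPE_LWT_XMIT program.
 */
static void add_bpf_xmit_encap(struct nlmsghdr *nlh, int prog_fd,
			       const char *prog_name, unsigned int headroom)
{
	struct nlattr *encap, *xmit;

	mnl_attr_put_u16(nlh, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_BPF);

	encap = mnl_attr_nest_start(nlh, RTA_ENCAP);
	xmit = mnl_attr_nest_start(nlh, LWT_BPF_XMIT);
	mnl_attr_put_u32(nlh, LWT_BPF_PROG_FD, prog_fd);
	mnl_attr_put_strz(nlh, LWT_BPF_PROG_NAME, prog_name);
	mnl_attr_nest_end(nlh, xmit);

	/* optional extra headroom reserved before lwtunnel_xmit() runs,
	 * bounded by LWT_BPF_MAX_HEADROOM (256) */
	mnl_attr_put_u32(nlh, LWT_BPF_XMIT_HEADROOM, headroom);
	mnl_attr_nest_end(nlh, encap);
}

With a matching iproute2 version the same setup is roughly: ip route add 192.168.253.0/24 encap bpf xmit obj lwt_xmit.o section lwt_xmit dev veth0.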

kernel/bpf/verifier.c

+11 -3

@@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
-				       const struct bpf_call_arg_meta *meta)
+				       const struct bpf_call_arg_meta *meta,
+				       enum bpf_access_type t)
 {
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_OUT:
+		/* dst_input() and dst_output() can't write for now */
+		if (t == BPF_WRITE)
+			return false;
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_LWT_XMIT:
 		if (meta)
 			return meta->pkt_access;
 
@@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+	if (type == PTR_TO_PACKET &&
+	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
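In practice the verifier distinction looks like this. The snippet is an assumed example, not part of the patch: loaded as BPF_PROG_TYPE_LWT_IN or BPF_PROG_TYPE_LWT_OUT the store through the packet pointer is rejected with "cannot write into packet", while the same program is accepted as BPF_PROG_TYPE_LWT_XMIT (where a real program would also fix the IP checksum, e.g. via bpf_l3_csum_replace()).

#include <linux/bpf.h>
#include <linux/ip.h>

#define SEC(NAME) __attribute__((section(NAME), used))

SEC("lwt_out")
int set_tos(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct iphdr *iph = data;

	if (data + sizeof(*iph) > data_end)
		return BPF_OK;

	iph->tos = 0x10;	/* write via PTR_TO_PACKET: rejected for LWT_IN/LWT_OUT */
	return BPF_OK;
}

char _license[] SEC("license") = "GPL";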

net/Kconfig

+8

@@ -402,6 +402,14 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config LWTUNNEL_BPF
+	bool "Execute BPF program as route nexthop action"
+	depends on LWTUNNEL
+	default y if LWTUNNEL=y
+	---help---
+	  Allows to run BPF programs as a nexthop action following a route
+	  lookup for incoming and outgoing packets.
+
 config DST_CACHE
 	bool
 	default n

net/core/Makefile

+1

@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o

net/core/filter.c

+173

@@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
 				 u32 flags)
 {
+	/* Verify that a link layer header is carried */
+	if (unlikely(skb->mac_header >= skb->network_header)) {
+		kfree_skb(skb);
+		return -ERANGE;
+	}
+
 	bpf_push_mac_rcsum(skb);
 	return flags & BPF_F_INGRESS ?
 	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+	   u64, flags)
+{
+	u32 max_len = __bpf_skb_max_len(skb);
+	u32 new_len = skb->len + head_room;
+	int ret;
+
+	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+		     new_len < skb->len))
+		return -EINVAL;
+
+	ret = skb_cow(skb, head_room);
+	if (likely(!ret)) {
+		/* Idea for this helper is that we currently only
+		 * allow to expand on mac header. This means that
+		 * skb->protocol network header, etc, stay as is.
+		 * Compared to bpf_skb_change_tail(), we're more
+		 * flexible due to not needing to linearize or
+		 * reset GSO. Intention for this helper is to be
+		 * used by an L3 skb that needs to push mac header
+		 * for redirection into L2 device.
+		 */
+		__skb_push(skb, head_room);
+		memset(skb->data, 0, head_room);
+		skb_reset_mac_header(skb);
+	}
+
+	bpf_compute_data_end(skb);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+	.func		= bpf_skb_change_head,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
 	    func == bpf_skb_vlan_pop ||
 	    func == bpf_skb_store_bytes ||
 	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_head ||
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
@@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
+	case BPF_FUNC_clone_redirect:
+		return &bpf_clone_redirect_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_change_head:
+		return &bpf_skb_change_head_proto;
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
+	case BPF_FUNC_l3_csum_replace:
+		return &bpf_l3_csum_replace_proto;
+	case BPF_FUNC_l4_csum_replace:
+		return &bpf_l4_csum_replace_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
+	default:
+		return lwt_inout_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool lwt_is_valid_access(int off, int size,
+				enum bpf_access_type type,
+				enum bpf_reg_type *reg_type)
+{
+	switch (off) {
+	case offsetof(struct __sk_buff, tc_classid):
+		return false;
+	}
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct __sk_buff, mark):
+		case offsetof(struct __sk_buff, priority):
+		case offsetof(struct __sk_buff, cb[0]) ...
+		     offsetof(struct __sk_buff, cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct __sk_buff, data):
+		*reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct __sk_buff, data_end):
+		*reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return __is_valid_access(off, size, type);
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops lwt_inout_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+	.get_func_proto		= lwt_xmit_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,13 +3187,31 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+	.ops	= &lwt_inout_ops,
+	.type	= BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+	.ops	= &lwt_inout_ops,
+	.type	= BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+	.ops	= &lwt_xmit_ops,
+	.type	= BPF_PROG_TYPE_LWT_XMIT,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&lwt_in_type);
+	bpf_register_prog_type(&lwt_out_type);
+	bpf_register_prog_type(&lwt_xmit_type);
 
 	return 0;
 }
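One consequence of lwt_is_valid_access() worth spelling out: even the read-only LWT_IN/LWT_OUT types may write the skb metadata fields mark, priority and cb[0]..cb[4], while tc_classid stays inaccessible. A minimal sketch, assuming an fwmark-based policy applied later by ip rule or netfilter:

#include <linux/bpf.h>

#define SEC(NAME) __attribute__((section(NAME), used))

SEC("lwt_in")
int mark_for_policy(struct __sk_buff *skb)
{
	skb->mark = 0x2a;	/* allowed: skb metadata, not packet data */
	skb->cb[0] = skb->len;	/* cb[] is writable scratch space */
	return BPF_OK;		/* continue routing as per nexthop */
}

char _license[] SEC("license") = "GPL";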
