From 88575199cc65de99a156888629a68180c830eff2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:04 +0100 Subject: bpf: drop unnecessary context cast from BPF_PROG_RUN Since long already bpf_func is not only about struct sk_buff * as input anymore. Make it generic as void *, so that callers don't need to cast for it each time they call BPF_PROG_RUN(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1f09c521adfe..7f246a281435 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -408,8 +408,8 @@ struct bpf_prog { enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - unsigned int (*bpf_func)(const struct sk_buff *skb, - const struct bpf_insn *filter); + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); /* Instructions for interpreter */ union { struct sock_filter insns[0]; @@ -504,7 +504,7 @@ static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, u32 ret; rcu_read_lock(); - ret = BPF_PROG_RUN(prog, (void *)xdp); + ret = BPF_PROG_RUN(prog, xdp); rcu_read_unlock(); return ret; -- cgit v1.3-7-g2ca7 From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Nov 2016 17:10:10 +0100 Subject: bpf: BPF for lightweight tunnel infrastructure Registers new BPF program types which correspond to the LWT hooks: - BPF_PROG_TYPE_LWT_IN => dst_input() - BPF_PROG_TYPE_LWT_OUT => dst_output() - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit() The separate program types are required to differentiate between the capabilities each LWT hook allows: * Programs attached to dst_input() or dst_output() are restricted and may only read the data of an skb. This prevent modification and possible invalidation of already validated packet headers on receive and the construction of illegal headers while the IP headers are still being assembled. * Programs attached to lwtunnel_xmit() are allowed to modify packet content as well as prepending an L2 header via a newly introduced helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is invoked after the IP header has been assembled completely. All BPF programs receive an skb with L3 headers attached and may return one of the following error codes: BPF_OK - Continue routing as per nexthop BPF_DROP - Drop skb and return EPERM BPF_REDIRECT - Redirect skb to device as per redirect() helper. (Only valid in lwtunnel_xmit() context) The return codes are binary compatible with their TC_ACT_ relatives to ease compatibility. Signed-off-by: Thomas Graf Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 32 +++- include/uapi/linux/lwtunnel.h | 23 +++ kernel/bpf/verifier.c | 14 +- net/Kconfig | 8 + net/core/Makefile | 1 + net/core/filter.c | 173 ++++++++++++++++++ net/core/lwt_bpf.c | 396 ++++++++++++++++++++++++++++++++++++++++++ net/core/lwtunnel.c | 2 + 9 files changed, 646 insertions(+), 5 deletions(-) create mode 100644 net/core/lwt_bpf.c (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7f246a281435..7ba644626553 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -438,7 +438,7 @@ struct xdp_buff { }; /* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf and act_bpf programs + * will be accessed by cls_bpf, act_bpf and lwt programs */ static inline void bpf_compute_data_end(struct sk_buff *skb) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1370a9d1456f..22ac82792687 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, }; enum bpf_attach_type { @@ -409,6 +412,16 @@ union bpf_attr { * * int bpf_get_numa_node_id() * Return: Id of current NUMA node. + * + * int bpf_skb_change_head() + * Grows headroom of skb and adjusts MAC header offset accordingly. + * Will extends/reallocae as required automatically. + * May change skb data pointer and will thus invalidate any check + * performed for direct packet access. + * @skb: pointer to skb + * @len: length of header to be pushed in front + * @flags: Flags (unused for now) + * Return: 0 on success or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -453,7 +466,8 @@ union bpf_attr { FN(skb_pull_data), \ FN(csum_update), \ FN(set_hash_invalid), \ - FN(get_numa_node_id), + FN(get_numa_node_id), \ + FN(skb_change_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -537,6 +551,22 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes */ +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 453cc6215bfd..92724cba1eba 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -10,6 +10,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, + LWTUNNEL_ENCAP_BPF, __LWTUNNEL_ENCAP_MAX, }; @@ -43,4 +44,26 @@ enum lwtunnel_ip6_t { #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWT_BPF_PROG_UNSPEC, + LWT_BPF_PROG_FD, + LWT_BPF_PROG_NAME, + __LWT_BPF_PROG_MAX, +}; + +#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1) + +enum { + LWT_BPF_UNSPEC, + LWT_BPF_IN, + LWT_BPF_OUT, + LWT_BPF_XMIT, + LWT_BPF_XMIT_HEADROOM, + __LWT_BPF_MAX, +}; + +#define LWT_BPF_MAX (__LWT_BPF_MAX - 1) + +#define LWT_BPF_MAX_HEADROOM 256 + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8740c5fa02fc..8135cb1077ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } diff --git a/net/Kconfig b/net/Kconfig index 7b6cd340b72b..a1005007224c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -402,6 +402,14 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config LWTUNNEL_BPF + bool "Execute BPF program as route nexthop action" + depends on LWTUNNEL + default y if LWTUNNEL=y + ---help--- + Allows to run BPF programs as a nexthop action following a route + lookup for incoming and outgoing packets. + config DST_CACHE bool default n diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/filter.c b/net/core/filter.c index 698a262b8ebb..1c4d0faf22c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || @@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size, type); +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = { .type = BPF_PROG_TYPE_CGROUP_SKB, }; +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. + */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : ""); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 03976e939818..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: -- cgit v1.3-7-g2ca7 From 366cbf2f46048d70005c6c33dc289330f24b54b0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Nov 2016 22:16:06 +0100 Subject: bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller After 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"), the rcu_read_lock() in bpf_prog_run_xdp() is superfluous, since callers need to hold rcu_read_lock() already to make sure BPF program doesn't get released in the background. Thus, drop it from bpf_prog_run_xdp(), as it can otherwise be misleading. Still keeping the bpf_prog_run_xdp() is useful as it allows for grepping in XDP supported drivers and to keep the typecheck on the context intact. For mlx4, this means we don't have a double rcu_read_lock() anymore. nfp can just make use of bpf_prog_run_xdp(), too. For qede, just move rcu_read_lock() out of the helper. When the driver gets atomic replace support, this will move to call-sites eventually. mlx5 needs actual fixing as it has the same issue as described already in 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"), that is, we're under RCU bh at this time, BPF programs are released via call_rcu(), and call_rcu() != call_rcu_bh(), so we need to properly mark read side as programs can get xchg()'ed in mlx5e_xdp_set() without queue reset. Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++++++-- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 7 +++++++ include/linux/filter.h | 18 +++++++++--------- 4 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux/filter.h') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index b036710ba52c..42cd687e6608 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -737,10 +737,10 @@ static inline struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, u16 wqe_counter, u32 cqe_bcnt) { - struct bpf_prog *xdp_prog = READ_ONCE(rq->xdp_prog); struct mlx5e_dma_info *di; struct sk_buff *skb; void *va, *data; + bool consumed; di = &rq->dma_info[wqe_counter]; va = page_address(di->page); @@ -759,7 +759,11 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, return NULL; } - if (mlx5e_xdp_handle(rq, xdp_prog, di, data, cqe_bcnt)) + rcu_read_lock(); + consumed = mlx5e_xdp_handle(rq, READ_ONCE(rq->xdp_prog), di, data, + cqe_bcnt); + rcu_read_unlock(); + if (consumed) return NULL; /* page/packet was consumed by XDP */ skb = build_skb(va, RQ_PAGE_SIZE(rq)); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 876ab3a92ad5..00d9a03be31d 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1518,7 +1518,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len) xdp.data = data; xdp.data_end = data + len; - return BPF_PROG_RUN(prog, &xdp); + return bpf_prog_run_xdp(prog, &xdp); } /** diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 172ff6da92ad..faeaa9f3b197 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1497,7 +1497,14 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data = page_address(bd->data) + cqe->placement_offset; xdp.data_end = xdp.data + len; + + /* Queues always have a full reset currently, so for the time + * being until there's atomic program replace just mark read + * side for map helpers. + */ + rcu_read_lock(); act = bpf_prog_run_xdp(prog, &xdp); + rcu_read_unlock(); if (act == XDP_PASS) return true; diff --git a/include/linux/filter.h b/include/linux/filter.h index 7ba644626553..97338134398f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -498,16 +498,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, skb); } -static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, - struct xdp_buff *xdp) +static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) { - u32 ret; - - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, xdp); - rcu_read_unlock(); - - return ret; + /* Caller needs to hold rcu_read_lock() (!), otherwise program + * can be released while still running, or map elements could be + * freed early while still having concurrent users. XDP fastpath + * already takes rcu_read_lock() when fetching the program, so + * it's not necessary here anymore. + */ + return BPF_PROG_RUN(prog, xdp); } static inline unsigned int bpf_prog_size(unsigned int proglen) -- cgit v1.3-7-g2ca7 From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 4 Dec 2016 23:19:41 +0100 Subject: bpf: add prog_digest and expose it via fdinfo/netlink When loading a BPF program via bpf(2), calculate the digest over the program's instruction stream and store it in struct bpf_prog's digest member. This is done at a point in time before any instructions are rewritten by the verifier. Any unstable map file descriptor number part of the imm field will be zeroed for the hash. fdinfo example output for progs: # cat /proc/1590/fdinfo/5 pos: 0 flags: 02000002 mnt_id: 11 prog_type: 1 prog_jited: 1 prog_digest: b27e8b06da22707513aa97363dfb11c7c3675d28 memlock: 4096 When programs are pinned and retrieved by an ELF loader, the loader can check the program's digest through fdinfo and compare it against one that was generated over the ELF file's program section to see if the program needs to be reloaded. Furthermore, this can also be exposed through other means such as netlink in case of a tc cls/act dump (or xdp in future), but also through tracepoints or other facilities to identify the program. Other than that, the digest can also serve as a base name for the work in progress kallsyms support of programs. The digest doesn't depend/select the crypto layer, since we need to keep dependencies to a minimum. iproute2 will get support for this facility. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/linux/filter.h | 7 +++- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_bpf.h | 1 + kernel/bpf/core.c | 65 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 24 +++++++++++++- kernel/bpf/verifier.c | 2 ++ net/sched/act_bpf.c | 9 ++++++ net/sched/cls_bpf.c | 8 +++++ 9 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 69d0a7f12a3b..8796ff03f472 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +void bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 97338134398f..f078d2b1cff6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -56,6 +57,9 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +/* Maximum BPF program size in bytes. */ +#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) + /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -404,8 +408,9 @@ struct bpf_prog { cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); - u32 len; /* Number of filter blocks */ enum bpf_prog_type type; /* Type of BPF program */ + u32 len; /* Number of filter blocks */ + u32 digest[SHA_DIGEST_WORDS]; /* Program digest */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const void *ctx, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 86786d45ee66..1adc0b654996 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,7 @@ enum { TCA_BPF_NAME, TCA_BPF_FLAGS, TCA_BPF_FLAGS_GEN, + TCA_BPF_DIGEST, __TCA_BPF_MAX, }; diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 063d9d465119..a6b88a6f7f71 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -27,6 +27,7 @@ enum { TCA_ACT_BPF_FD, TCA_ACT_BPF_NAME, TCA_ACT_BPF_PAD, + TCA_ACT_BPF_DIGEST, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82a04143368e..bdcc9f4ba767 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side. + */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85af86c496cd..c0d2b423ce93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38d05da84a49..cb37339ca0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3176,6 +3176,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 84c1d2da4f8b..1c60317f0121 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, + sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index f70e03d2d2c8..adc776048d1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } -- cgit v1.3-7-g2ca7 From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 7 Dec 2016 15:53:11 -0800 Subject: bpf: xdp: Allow head adjustment in XDP prog This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. This patch adds one "xdp_adjust_head" bit to bpf_prog for the XDP-capable driver to check if the XDP prog requires bpf_xdp_adjust_head() support. The driver can then decide to error out during XDP_SETUP_PROG. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/s390/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++++ drivers/net/ethernet/qlogic/qede/qede_main.c | 5 ++++ include/linux/filter.h | 6 +++-- include/uapi/linux/bpf.h | 11 ++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 2 +- net/core/filter.c | 28 ++++++++++++++++++++-- 13 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include/linux/filter.h') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 49a81f1fc1d6..f441eda63bec 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) int err; int i; + if (prog && prog->xdp_adjust_head) { + en_err(priv, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + xdp_ring_num = prog ? priv->rx_ring_num : 0; /* No need to reconfigure buffers when simply swapping the diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07020276fe73..cbfa38fc72c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) bool reset, was_opened; int i; + if (prog && prog->xdp_adjust_head) { + netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + mutex_lock(&priv->state_lock); if ((netdev->features & NETIF_F_LRO) && prog) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00d9a03be31d..e8d448109e03 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog) }; int err; + if (prog && prog->xdp_adjust_head) { + nn_err(nn, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } if (!prog && !nn->xdp_prog) return 0; if (prog && nn->xdp_prog) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cf1dd1436d93..aecdd1c5c0ea 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) { struct qede_reload_args args; + if (prog && prog->xdp_adjust_head) { + DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + /* If we're called, there was already a bpf reference increment */ args.func = &qede_xdp_reload_func; args.u.new_prog = prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index f078d2b1cff6..6a1658308612 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -406,7 +406,8 @@ struct bpf_prog { u16 jited:1, /* Is our filter JIT'ed? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + xdp_adjust_head:1; /* Adjusting pkt head? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ @@ -440,6 +441,7 @@ struct bpf_skb_data_end { struct xdp_buff { void *data; void *data_end; + void *data_hard_start; }; /* compute the linear packet data range [data, data_end) which @@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update), \ FN(set_hash_invalid), \ FN(get_numa_node_id), \ - FN(skb_change_head), + FN(skb_change_head), \ + FN(xdp_adjust_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -576,6 +583,8 @@ struct bpf_sock { __u32 protocol; }; +#define XDP_PACKET_HEADROOM 256 + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bdcc9f4ba767..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88f609f1c0c3..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b14f85f45c6..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index b751202e12f8..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || @@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func) func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.3-7-g2ca7 From f23bc46c30ca5ef58b8549434899fcbac41b2cfc Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 15 Dec 2016 12:12:54 -0800 Subject: net: xdp: add invalid buffer warning This adds a warning for drivers to use when encountering an invalid buffer for XDP. For normal cases this should not happen but to catch this in virtual/qemu setups that I may not have expected from the emulation layer having a standard warning is useful. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + net/core/filter.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6a1658308612..af8a1804cac6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -602,6 +602,7 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_buffer(void); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/net/core/filter.c b/net/core/filter.c index b1461708a977..7190bd648154 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2972,6 +2972,12 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +void bpf_warn_invalid_xdp_buffer(void) +{ + WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n"); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer); + static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, -- cgit v1.3-7-g2ca7 From aafe6ae9cee32df85eb5e8bb6dd1d918e6807b09 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:57 +0100 Subject: bpf: dynamically allocate digest scratch buffer Geert rightfully complained that 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") added a too large allocation of variable 'raw' from bss section, and should instead be done dynamically: # ./scripts/bloat-o-meter kernel/bpf/core.o.1 kernel/bpf/core.o.2 add/remove: 3/0 grow/shrink: 0/0 up/down: 33291/0 (33291) function old new delta raw - 32832 +32832 [...] Since this is only relevant during program creation path, which can be considered slow-path anyway, lets allocate that dynamically and be not implicitly dependent on verifier mutex. Move bpf_prog_calc_digest() at the beginning of replace_map_fd_with_map_ptr() and also error handling stays straight forward. Reported-by: Geert Uytterhoeven Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/filter.h | 14 +++++++++++--- kernel/bpf/core.c | 27 ++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 6 ++++-- 5 files changed, 33 insertions(+), 18 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8796ff03f472..201eb483c46f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,7 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); -void bpf_prog_calc_digest(struct bpf_prog *fp); +int bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index af8a1804cac6..702314253797 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,9 +57,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -/* Maximum BPF program size in bytes. */ -#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -517,6 +514,17 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, xdp); } +static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) +{ + return prog->len * sizeof(struct bpf_insn); +} + +static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog) +{ + return round_up(bpf_prog_insn_size(prog) + + sizeof(__be64) + 1, SHA_MESSAGE_BYTES); +} + static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83e0d153b0b4..75c08b8068d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,28 +136,29 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } -#define SHA_BPF_RAW_SIZE \ - round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) - -/* Called under verifier mutex. */ -void bpf_prog_calc_digest(struct bpf_prog *fp) +int bpf_prog_calc_digest(struct bpf_prog *fp) { const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); - static u32 ws[SHA_WORKSPACE_WORDS]; - static u8 raw[SHA_BPF_RAW_SIZE]; - struct bpf_insn *dst = (void *)raw; + u32 raw_size = bpf_prog_digest_scratch_size(fp); + u32 ws[SHA_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; + struct bpf_insn *dst; bool was_ld_map; - u8 *todo = raw; + u8 *raw, *todo; __be32 *result; __be64 *bits; + raw = vmalloc(raw_size); + if (!raw) + return -ENOMEM; + sha_init(fp->digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ + dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -177,12 +178,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) } } - psize = fp->len * sizeof(struct bpf_insn); - memset(&raw[psize], 0, sizeof(raw) - psize); + psize = bpf_prog_insn_size(fp); + memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA_MESSAGE_BYTES); blocks = bsize / SHA_MESSAGE_BYTES; + todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { @@ -199,6 +201,9 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) result = (__force __be32 *)fp->digest; for (i = 0; i < SHA_DIGEST_WORDS; i++) result[i] = cpu_to_be32(fp->digest[i]); + + vfree(raw); + return 0; } static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4819ec9d95f6..35d674c1f12e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -811,7 +811,7 @@ static int bpf_prog_load(union bpf_attr *attr) err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - prog->len * sizeof(struct bpf_insn)) != 0) + bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81e267bc4640..64b7b1abe087 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2931,6 +2931,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i, j, err; + err = bpf_prog_calc_digest(env->prog); + if (err) + return err; + for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { @@ -3178,8 +3182,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } - bpf_prog_calc_digest(env->prog); - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.3-7-g2ca7