From 67306f84ca78c2ca5136f21791710c126a55a19b Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 17 Feb 2020 19:04:32 -0800 Subject: selftests/bpf: Add bpf_read_branch_records() selftest Add a selftest to test: * default bpf_read_branch_records() behavior * BPF_F_GET_BRANCH_RECORDS_SIZE flag behavior * error path on non branch record perf events * using helper to write to stack * using helper to write to global On host with hardware counter support: # ./test_progs -t perf_branches #27/1 perf_branches_hw:OK #27/2 perf_branches_no_hw:OK #27 perf_branches:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED On host without hardware counter support (VM): # ./test_progs -t perf_branches #27/1 perf_branches_hw:OK #27/2 perf_branches_no_hw:OK #27 perf_branches:OK Summary: 1/2 PASSED, 1 SKIPPED, 0 FAILED Also sync tools/include/uapi/linux/bpf.h. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200218030432.4600-3-dxu@dxuuu.xyz --- tools/include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1d74a2bd234..a7e59756853f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2892,6 +2892,25 @@ union bpf_attr { * Obtain the 64bit jiffies * Return * The 64 bit jiffies + * + * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (struct perf_branch_entry) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of sizeof(struct perf_branch_entry). + * + * **-ENOENT** if architecture does not support branch records. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3012,7 +3031,8 @@ union bpf_attr { FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ FN(send_signal_thread), \ - FN(jiffies64), + FN(jiffies64), \ + FN(read_branch_records), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3091,6 +3111,9 @@ enum bpf_func_id { /* BPF_FUNC_sk_storage_get flags */ #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +/* BPF_FUNC_read_branch_records flags. */ +#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. 
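/* Editor's sketch, not part of this patch: a perf_event BPF program in the
 * spirit of the selftest above. It first queries the space needed for all
 * branch records via BPF_F_GET_BRANCH_RECORDS_SIZE (buf may be NULL), then
 * copies the records into a global buffer whose layout mirrors
 * struct perf_branch_entry (three __u64s per entry). Whether records are
 * available depends on the hardware/perf_event setup, hence the error values.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct branch_entry {	/* mirrors struct perf_branch_entry */
	__u64 from;
	__u64 to;
	__u64 flags;
};

struct branch_entry entries[32];
int required_size;
int written;

SEC("perf_event")
int perf_branches(void *ctx)
{
	/* Size-only query: returns bytes needed for all records, or e.g. -ENOENT. */
	required_size = bpf_read_branch_records(ctx, NULL, 0,
						BPF_F_GET_BRANCH_RECORDS_SIZE);

	/* Copy up to sizeof(entries) bytes of records; returns bytes written. */
	written = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
	return 0;
}

char _license[] SEC("license") = "GPL";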
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3-59-g8ed1b From b0ac4941aa2a249bbb06de86110cd9e2e53980ca Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 3 Mar 2020 15:05:02 -0500 Subject: bpf: Sync uapi bpf.h to tools/ sync tools/include/uapi/linux/bpf.h to match include/uapi/linux/bpf.h Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303200503.226217-3-willemdebruijn.kernel@gmail.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 906e9f2752db..7c689f4552dd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3176,6 +3176,7 @@ struct __sk_buff { __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; }; struct bpf_tunnel_key { -- cgit v1.2.3-59-g8ed1b From 1aae4bdd787998ea331a56f3db9d8595790fe2f9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 2 Mar 2020 16:32:31 -0800 Subject: bpf: Switch BPF UAPI #define constants used from BPF program side to enums Switch BPF UAPI constants, previously defined as #define macro, to anonymous enum values. This preserves constants values and behavior in expressions, but has added advantaged of being captured as part of DWARF and, subsequently, BTF type info. Which, in turn, greatly improves usefulness of generated vmlinux.h for BPF applications, as it will not require BPF users to copy/paste various flags and constants, which are frequently used with BPF helpers. Only those constants that are used/useful from BPF program side are converted. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200303003233.3496043-2-andriin@fb.com --- include/uapi/linux/bpf.h | 175 +++++++++++++++++++++++++--------------- tools/include/uapi/linux/bpf.h | 177 +++++++++++++++++++++++++---------------- 2 files changed, 219 insertions(+), 133 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 180337fae97e..d6b33ea27bcc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. 
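/* Editor's sketch, not part of this patch: once these constants are enum
 * values they land in kernel BTF, so a bpftool-generated vmlinux.h
 * ("bpftool btf dump file /sys/kernel/btf/vmlinux format c") already defines
 * BPF_F_CURRENT_CPU, BPF_MAP_TYPE_PERF_EVENT_ARRAY, etc., and a BPF program
 * can drop its copied UAPI #defines. The attach point below is purely
 * illustrative; libbpf's bpf_helpers.h is assumed.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} events SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_openat")
int count_openat(void *ctx)
{
	__u32 one = 1;

	/* BPF_F_CURRENT_CPU comes straight from vmlinux.h, no local #define. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &one, sizeof(one));
	return 0;
}

char _license[] SEC("license") = "GPL";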
*/ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. 
*/ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! 
*/ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7c689f4552dd..d6b33ea27bcc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. 
*/ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. 
*/ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! 
*/ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; -- cgit v1.2.3-59-g8ed1b From ae24082331d9bbaae283aafbe930a8f0eb85605a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:49 +0100 Subject: bpf: Introduce BPF_MODIFY_RETURN When multiple programs are attached, each program receives the return value from the previous program on the stack and the last program provides the return value to the attached function. The fmod_ret bpf programs are run after the fentry programs and before the fexit programs. The original function is only called if all the fmod_ret programs return 0 to avoid any unintended side-effects. The success value, i.e. 0 is not currently configurable but can be made so where user-space can specify it at load time. For example: int func_to_be_attached(int a, int b) { <--- do_fentry do_fmod_ret: if (ret != 0) goto do_fexit; original_function: } <--- do_fexit The fmod_ret program attached to this function can be defined as: SEC("fmod_ret/func_to_be_attached") int BPF_PROG(func_name, int a, int b, int ret) { // This will skip the original function logic. return 1; } The first fmod_ret program is passed 0 in its return argument. 
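A minimal user-space sketch (editor's illustration, not part of this patch) of loading and attaching such a program with libbpf; the object file and section names are hypothetical, and fmod_ret programs go through the same BTF trampoline attach path as fentry/fexit:

#include <stdio.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file("mod_ret.bpf.o", NULL); /* hypothetical object */
	if (libbpf_get_error(obj))
		return 1;
	if (bpf_object__load(obj))
		goto err;

	prog = bpf_object__find_program_by_title(obj, "fmod_ret/func_to_be_attached");
	if (!prog)
		goto err;

	/* Same trampoline-based attach call as fentry/fexit programs. */
	link = bpf_program__attach_trace(prog);
	if (libbpf_get_error(link))
		goto err;

	printf("fmod_ret program attached\n");
	/* ... run the workload, then ... */
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
err:
	bpf_object__close(obj);
	return 1;
}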
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-4-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 130 +++++++++++++++++++++++++++++++++++++---- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 3 +- kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 5 +- kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 130 insertions(+), 13 deletions(-) (limited to 'tools/include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d6349e930b06..b1fd000feb89 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,7 +1362,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size) + struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; int cnt = 0; @@ -1383,6 +1383,13 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; + /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + * of the previous call which is then passed on the stack to + * the next BPF program. + */ + if (mod_ret) + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1442,6 +1449,23 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) return 0; } +static int emit_mod_ret_check_imm8(u8 **pprog, int value) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (!is_imm8(value)) + return -EINVAL; + + if (value == 0) + EMIT2(0x85, add_2reg(0xC0, BPF_REG_0, BPF_REG_0)); + else + EMIT3(0x83, add_1reg(0xF8, BPF_REG_0), value); + + *pprog = prog; + return 0; +} + static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_progs *tp, int stack_size) { @@ -1449,9 +1473,49 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, + struct bpf_tramp_progs *tp, int stack_size, + u8 **branches) +{ + u8 *prog = *pprog; + int i; + + /* The first fmod_ret program will receive a garbage return value. + * Set this to 0 to avoid confusing the program. + */ + emit_mov_imm32(&prog, false, BPF_REG_0, 0); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + for (i = 0; i < tp->nr_progs; i++) { + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) return -EINVAL; + + /* Generate a branch: + * + * if (ret != 0) + * goto do_fexit; + * + * If needed this can be extended to any integer value which can + * be passed by user-space when the program is loaded. + */ + if (emit_mod_ret_check_imm8(&prog, 0)) + return -EINVAL; + + /* Save the location of the branch and Generate 6 nops + * (4 bytes for an offset and 2 bytes for the jump) These nops + * are replaced with a conditional jump once do_fexit (i.e. the + * start of the fexit invocation) is finalized. 
+ */ + branches[i] = prog; + emit_nops(&prog, 4 + 2); } + *pprog = prog; return 0; } @@ -1521,10 +1585,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call) { - int cnt = 0, nr_args = m->nr_args; + int ret, i, cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; + struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + u8 **branches = NULL; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1557,24 +1623,60 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; + if (fmod_ret->nr_progs) { + branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + GFP_KERNEL); + if (!branches) + return -ENOMEM; + + if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size, + branches)) { + ret = -EINVAL; + goto cleanup; + } + } + if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs) + if (fentry->nr_progs || fmod_ret->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ - if (emit_call(&prog, orig_call, prog)) - return -EINVAL; + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } + if (fmod_ret->nr_progs) { + /* From Intel 64 and IA-32 Architectures Optimization + * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler + * Coding Rule 11: All branch targets should be 16-byte + * aligned. + */ + emit_align(&prog, 16); + /* Update the branches saved in invoke_bpf_mod_ret with the + * aligned address of do_fexit. + */ + for (i = 0; i < fmod_ret->nr_progs; i++) + emit_cond_near_jump(&branches[i], prog, branches[i], + X86_JNE); + } + if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) - return -EINVAL; + if (invoke_bpf(m, &prog, fexit, stack_size)) { + ret = -EINVAL; + goto cleanup; + } if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_regs(m, &prog, nr_args, stack_size); + /* This needs to be done regardless. If there were fmod_ret programs, + * the return value is only updated on the stack and still needs to be + * restored to R0. 
+ */ if (flags & BPF_TRAMP_F_CALL_ORIG) /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); @@ -1586,9 +1688,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ /* Make sure the trampoline generation logic doesn't overflow */ - if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) - return -EFAULT; - return prog - (u8 *)image; + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) { + ret = -EFAULT; + goto cleanup; + } + ret = prog - (u8 *)image; + +cleanup: + kfree(branches); + return ret; } static int emit_fallback_jump(u8 **pprog) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98ec10b23dbb..f748b31e5888 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -474,6 +474,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, + BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..30841fb8b3c0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,7 +3710,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && arg == nr_args) { if (!t) /* Default prog with 5 args. 6th arg is retval. 
*/ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 13de65363ba2..7ce0815793dd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2324,6 +2324,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN && prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 546198f6f307..221a17af1f81 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -232,7 +232,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -269,6 +270,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 289383edfc8c..2460c8e6b5be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9950,6 +9950,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3-59-g8ed1b From b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:56 -0300 Subject: bpf: Added new helper bpf_get_ns_current_pid_tgid New bpf helper bpf_get_ns_current_pid_tgid, This helper will return pid and tgid from current task which namespace matches dev_t and inode number provided, this will allows us to instrument a process inside a container. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200304204157.58695-3-cneirabustos@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 20 ++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++- 7 files changed, 88 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fd91b7c95ea..4ec835334a1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1497,6 +1497,7 @@ extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; +extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..0f9ca46e1978 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2149,6 +2149,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..01878db15eaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "../../lib/kstrtox.h" @@ -499,3 +501,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6a490d8ce9de..b5071c7e93ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -843,6 +843,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog 
*prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index cebed6fb5bbb..c1e2b5410faa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct bpf_fib_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3-59-g8ed1b From d831ee84bfc9173eecf30dbbc2553ae81b996c60 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 6 Mar 2020 08:59:23 +0000 Subject: bpf: Add bpf_xdp_output() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce new helper that reuses existing xdp perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct xdp_buff *' as a tracepoint argument. Signed-off-by: Eelco Chaudron Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158348514556.2239.11050972434793741444.stgit@xdp-tutorial --- include/uapi/linux/bpf.h | 26 ++++++++++- kernel/bpf/verifier.c | 4 +- kernel/trace/bpf_trace.c | 3 ++ net/core/filter.c | 16 ++++++- tools/include/uapi/linux/bpf.h | 26 ++++++++++- .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 53 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_xdp_bpf2bpf.c | 24 ++++++++++ 7 files changed, 148 insertions(+), 4 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. 
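/* Editor's sketch, not part of these patches: using the new
 * bpf_get_ns_current_pid_tgid() helper documented above from a tracing
 * program, to resolve pid/tgid relative to one specific pid namespace.
 * User space would fill target_dev/target_ino from stat() on
 * /proc/<pid>/ns/pid before attaching; all names here are made up, and a
 * bpf_helpers.h regenerated from this UAPI header is assumed.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

__u64 target_dev;	/* st_dev of /proc/<pid>/ns/pid, set by the loader */
__u64 target_ino;	/* st_ino of /proc/<pid>/ns/pid, set by the loader */
__u32 pid_in_ns;
__u32 tgid_in_ns;

SEC("tracepoint/syscalls/sys_enter_write")
int trace_write(void *ctx)
{
	struct bpf_pidns_info ns = {};

	/* Non-zero: -EINVAL (dev/ino don't match the current task's pidns)
	 * or -ENOENT (no pid namespace for the current task).
	 */
	if (bpf_get_ns_current_pid_tgid(target_dev, target_ino, &ns, sizeof(ns)))
		return 0;

	pid_in_ns = ns.pid;
	tgid_in_ns = ns.tgid;
	return 0;
}

char _license[] SEC("license") = "GPL";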
* + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55d376c53f7d..745f3cfdf3b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3650,7 +3650,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3740,6 +3741,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b5071c7e93ca..e619eedb5919 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1145,6 +1145,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { }; extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1220,6 +1221,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; #endif default: return raw_tp_prog_func_proto(func_id, prog); diff --git a/net/core/filter.c b/net/core/filter.c index cd0a532db4e7..22219544410f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4061,7 +4061,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + if (unlikely(!xdp || + xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp->data, @@ -4079,6 
+4080,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_xdp_output_btf_ids[5]; +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_xdp_output_btf_ids, +}; + BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? sock_gen_cookie(skb->sk) : 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 4ba011031d4c..a0f688c37023 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -4,17 +4,51 @@ #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" +struct meta { + int ifindex; + int pkt_len; +}; + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int duration = 0; + struct meta *meta = (struct meta *)data; + struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + + if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta), + "check_size", "size %u < %zu\n", + size, sizeof(pkt_v4) + sizeof(*meta))) + return; + + if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex", + "meta->ifindex = %d\n", meta->ifindex)) + return; + + if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len", + "meta->pkt_len = %zd\n", sizeof(pkt_v4))) + return; + + if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), + "check_packet_content", "content not the same\n")) + return; + + *(bool *)ctx = true; +} + void test_xdp_bpf2bpf(void) { __u32 duration = 0, retval, size; char buf[128]; int err, pkt_fd, map_fd; + bool passed = false; struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; struct bpf_program *prog; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; /* Load XDP program to introspect */ pkt_skel = test_xdp__open_and_load(); @@ -50,6 +84,14 @@ void test_xdp_bpf2bpf(void) if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) goto out; + /* Set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &passed; + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), + 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out; + /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); @@ -60,6 +102,15 @@ void test_xdp_bpf2bpf(void) err, errno, retval, size)) goto out; + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. 
+ */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out; + + CHECK_FAIL(!passed); + /* Verify test results */ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), "result", "fentry failed err %llu\n", @@ -70,6 +121,8 @@ void test_xdp_bpf2bpf(void) "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); out: + if (pb) + perf_buffer__free(pb); test_xdp__destroy(pkt_skel); test_xdp_bpf2bpf__destroy(ftrace_skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index 42dd2fedd588..a038e827f850 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -3,6 +3,8 @@ #include #include +char _license[] SEC("license") = "GPL"; + struct net_device { /* Structure does not need to contain all entries, * as "preserve_access_index" will use BTF to fix this... @@ -27,10 +29,32 @@ struct xdp_buff { struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); +struct meta { + int ifindex; + int pkt_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + __u64 test_result_fentry = 0; SEC("fentry/FUNC") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { + struct meta meta; + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + + meta.ifindex = xdp->rxq->dev->ifindex; + meta.pkt_len = data_end - data; + bpf_xdp_output(xdp, &perf_buf_map, + ((__u64) meta.pkt_len << 32) | + BPF_F_CURRENT_CPU, + &meta, sizeof(meta)); + test_result_fentry = xdp->rxq->dev->ifindex; return 0; } -- cgit v1.2.3-59-g8ed1b From bcd66b10b5e956b3e81f76a61abfed2501ff4038 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 13 Mar 2020 12:31:05 +0100 Subject: tools/bpf: Move linux/types.h for selftests and bpftool Commit fe4eb069edb7 ("bpftool: Use linux/types.h from source tree for profiler build") added a build dependency on tools/testing/selftests/bpf to tools/bpf/bpftool. This is suboptimal with respect to a possible stand-alone build of bpftool. Fix this by moving tools/testing/selftests/bpf/include/uapi/linux/types.h to tools/include/uapi/linux/types.h. This requires an adjustment in the include search path order for the tests in tools/testing/selftests/bpf so that tools/include/linux/types.h is selected when building host binaries and tools/include/uapi/linux/types.h is selected when building bpf binaries. Verified by compiling bpftool and the bpf selftests on x86_64 with this change. 
Fixes: fe4eb069edb7 ("bpftool: Use linux/types.h from source tree for profiler build") Suggested-by: Andrii Nakryiko Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200313113105.6918-1-tklauser@distanz.ch --- tools/bpf/bpftool/Makefile | 1 - tools/include/uapi/linux/types.h | 23 ++++++++++++++++++++++ tools/testing/selftests/bpf/Makefile | 7 ++++--- .../selftests/bpf/include/uapi/linux/types.h | 23 ---------------------- 4 files changed, 27 insertions(+), 27 deletions(-) create mode 100644 tools/include/uapi/linux/types.h delete mode 100644 tools/testing/selftests/bpf/include/uapi/linux/types.h (limited to 'tools/include') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 9ca3bfbb9ac4..f584d1fdfc64 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -129,7 +129,6 @@ $(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF) skeleton/profiler.bpf.o: skeleton/profiler.bpf.c $(LIBBPF) $(QUIET_CLANG)$(CLANG) \ -I$(srctree)/tools/include/uapi/ \ - -I$(srctree)/tools/testing/selftests/bpf/include/uapi \ -I$(LIBBPF_PATH) -I$(srctree)/tools/lib \ -g -O2 -target bpf -c $< -o $@ diff --git a/tools/include/uapi/linux/types.h b/tools/include/uapi/linux/types.h new file mode 100644 index 000000000000..91fa51a9c31d --- /dev/null +++ b/tools/include/uapi/linux/types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI_LINUX_TYPES_H +#define _UAPI_LINUX_TYPES_H + +#include + +/* copied from linux:include/uapi/linux/types.h */ +#define __bitwise +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +typedef __u16 __bitwise __sum16; +typedef __u32 __bitwise __wsum; + +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + +#endif /* _UAPI_LINUX_TYPES_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index da4389dde9f7..074a05efd1ca 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,8 +20,9 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ + -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread @@ -194,8 +195,8 @@ MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ - -I$(INCLUDE_DIR) -I$(CURDIR) -I$(CURDIR)/include/uapi \ - -I$(APIDIR) -I$(abspath $(OUTPUT)/../usr/include) + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types diff --git a/tools/testing/selftests/bpf/include/uapi/linux/types.h b/tools/testing/selftests/bpf/include/uapi/linux/types.h deleted file mode 100644 index 91fa51a9c31d..000000000000 --- a/tools/testing/selftests/bpf/include/uapi/linux/types.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _UAPI_LINUX_TYPES_H -#define _UAPI_LINUX_TYPES_H - 
-#include - -/* copied from linux:include/uapi/linux/types.h */ -#define __bitwise -typedef __u16 __bitwise __le16; -typedef __u16 __bitwise __be16; -typedef __u32 __bitwise __le32; -typedef __u32 __bitwise __be32; -typedef __u64 __bitwise __le64; -typedef __u64 __bitwise __be64; - -typedef __u16 __bitwise __sum16; -typedef __u32 __bitwise __wsum; - -#define __aligned_u64 __u64 __attribute__((aligned(8))) -#define __aligned_be64 __be64 __attribute__((aligned(8))) -#define __aligned_le64 __le64 __attribute__((aligned(8))) - -#endif /* _UAPI_LINUX_TYPES_H */ -- cgit v1.2.3-59-g8ed1b From 21114b7feec29e4425a3ac48a037569c016a46c8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 25 Mar 2020 15:52:33 +0300 Subject: net: macsec: add support for offloading to the MAC This patch adds a new MACsec offloading option, MACSEC_OFFLOAD_MAC, allowing a user to select a MAC as a provider for MACsec offloading operations. Signed-off-by: Antoine Tenart Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/macsec.c | 13 +++++++++++-- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 35abebe95b79..d29c072e19af 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -339,7 +339,8 @@ static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) /* Checks if a MACsec interface is being offloaded to an hardware engine */ static bool macsec_is_offloaded(struct macsec_dev *macsec) { - if (macsec->offload == MACSEC_OFFLOAD_PHY) + if (macsec->offload == MACSEC_OFFLOAD_MAC || + macsec->offload == MACSEC_OFFLOAD_PHY) return true; return false; @@ -355,6 +356,9 @@ static bool macsec_check_offload(enum macsec_offload offload, if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev && macsec->real_dev->phydev->macsec_ops; + else if (offload == MACSEC_OFFLOAD_MAC) + return macsec->real_dev->features & NETIF_F_HW_MACSEC && + macsec->real_dev->macsec_ops; return false; } @@ -369,9 +373,14 @@ static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, if (offload == MACSEC_OFFLOAD_PHY) ctx->phydev = macsec->real_dev->phydev; + else if (offload == MACSEC_OFFLOAD_MAC) + ctx->netdev = macsec->real_dev; } - return macsec->real_dev->phydev->macsec_ops; + if (offload == MACSEC_OFFLOAD_PHY) + return macsec->real_dev->phydev->macsec_ops; + else + return macsec->real_dev->macsec_ops; } /* Returns a pointer to the MACsec ops struct if any and updates the MACsec diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 61e0801c82df..d6ccd0105c05 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -489,6 +489,7 @@ enum macsec_validation_type { enum macsec_offload { MACSEC_OFFLOAD_OFF = 0, MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, __MACSEC_OFFLOAD_END, MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 024af2d1d0af..771371d5b996 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -489,6 +489,7 @@ enum macsec_validation_type { enum macsec_offload { MACSEC_OFFLOAD_OFF = 0, MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, __MACSEC_OFFLOAD_END, MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, }; -- cgit v1.2.3-59-g8ed1b From f318903c0bf42448b4c884732df2bbb0ef7a2284 Mon Sep 17 
00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:52 +0100 Subject: bpf: Add netns cookie and enable it for bpf cgroup hooks In Cilium we're mainly using BPF cgroup hooks today in order to implement kube-proxy free Kubernetes service translation for ClusterIP, NodePort (*), ExternalIP, and LoadBalancer as well as HostPort mapping [0] for all traffic between Cilium managed nodes. While this works in its current shape and avoids packet-level NAT for inter Cilium managed node traffic, there is one major limitation we're facing today, that is, lack of netns awareness. In Kubernetes, the concept of Pods (which hold one or multiple containers) has been built around network namespaces, so while we can use the global scope of attaching to root BPF cgroup hooks also to our advantage (e.g. for exposing NodePort ports on loopback addresses), we also have the need to differentiate between initial network namespaces and non-initial one. For example, ExternalIP services mandate that non-local service IPs are not to be translated from the host (initial) network namespace as one example. Right now, we have an ugly work-around in place where non-local service IPs for ExternalIP services are not xlated from connect() and friends BPF hooks but instead via less efficient packet-level NAT on the veth tc ingress hook for Pod traffic. On top of determining whether we're in initial or non-initial network namespace we also have a need for a socket-cookie like mechanism for network namespaces scope. Socket cookies have the nice property that they can be combined as part of the key structure e.g. for BPF LRU maps without having to worry that the cookie could be recycled. We are planning to use this for our sessionAffinity implementation for services. Therefore, add a new bpf_get_netns_cookie() helper which would resolve both use cases at once: bpf_get_netns_cookie(NULL) would provide the cookie for the initial network namespace while passing the context instead of NULL would provide the cookie from the application's network namespace. We're using a hole, so no size increase; the assignment happens only once. Therefore this allows for a comparison on initial namespace as well as regular cookie usage as we have today with socket cookies. We could later on enable this helper for other program types as well as we would see need. 
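As a rough illustration of the intended usage (not part of this patch; the map layout, section name and return codes are assumptions for the sketch, and it presumes BPF helper definitions generated from this synced header), a cgroup/connect4 program could combine both forms of the helper like this:

/* Sketch: tell host-netns traffic apart from pod traffic in a connect4
 * hook and key per-netns service affinity state by the netns cookie.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_LRU_HASH);
    __uint(max_entries, 65536);
    __type(key, __u64);   /* netns cookie */
    __type(value, __u32); /* e.g. previously selected backend */
} affinity SEC(".maps");

SEC("cgroup/connect4")
int sock4_connect(struct bpf_sock_addr *ctx)
{
    __u64 cookie = bpf_get_netns_cookie(ctx);       /* caller's netns */
    __u64 host_cookie = bpf_get_netns_cookie(NULL); /* initial netns */

    if (cookie == host_cookie)
        /* e.g. skip ExternalIP translation from the host namespace */
        return 1;

    /* use the cookie as (part of) a key for sessionAffinity-like state */
    __u32 *backend = bpf_map_lookup_elem(&affinity, &cookie);
    if (backend) {
        /* ... reuse the previously selected backend ... */
    }
    return 1;
}

char _license[] SEC("license") = "GPL";

Because the cookie is never recycled for the lifetime of the namespace, it can be embedded directly in LRU map keys without the reuse concerns that socket FDs or netns inode numbers would bring.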
(*) Both externalTrafficPolicy={Local|Cluster} types [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/c47d2346982693a9cf9da0e12690453aded4c788.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/net/net_namespace.h | 10 ++++++++++ include/uapi/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/verifier.c | 16 ++++++++++------ net/core/filter.c | 37 +++++++++++++++++++++++++++++++++++++ net/core/net_namespace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 16 +++++++++++++++- 7 files changed, 103 insertions(+), 8 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb981c204fa..78046c570596 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -233,6 +233,7 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ + ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 854d39ef1ca3..1c6edfdb9a2c 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -168,6 +168,9 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif + + atomic64_t net_cookie; /* written once */ + #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif @@ -273,6 +276,8 @@ static inline int check_net(const struct net *net) void net_drop_ns(void *); +u64 net_gen_cookie(struct net *net); + #else static inline struct net *get_net(struct net *net) @@ -300,6 +305,11 @@ static inline int check_net(const struct net *net) return 1; } +static inline u64 net_gen_cookie(struct net *net) +{ + return 0; +} + #define net_drop_ns NULL #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ea2a868324e..46ba86c540e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3461,13 +3461,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX) { + } else if (arg_type == ARG_PTR_TO_CTX || + arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_CTX_OR_NULL)) { + if (type != expected_type) + goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ diff --git a/net/core/filter.c b/net/core/filter.c index 6cb7e0e24473..e249a499cbe5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4141,6 +4141,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ +#ifdef CONFIG_NET_NS + return net_gen_cookie(sk ? sk->sk_net.net : &init_net); +#else + return 0; +#endif +} + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -5968,6 +6001,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: @@ -5994,6 +6029,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 757cc1d084e7..190ca66a383b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; +static atomic64_t cookie_gen; + +u64 net_gen_cookie(struct net *net) +{ + while (1) { + u64 res = atomic64_read(&net->net_cookie); + + if (res) + return res; + res = atomic64_inc_return(&cookie_gen); + atomic64_cmpxchg(&net->net_cookie, 0, res); + } +} + static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -1087,6 +1101,7 @@ static int __init net_ns_init(void) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); + net_gen_cookie(&init_net); down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 0f09abd105da6c37713d2b253730a86cb45e127a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:54 +0100 Subject: bpf: Enable bpf cgroup hooks to retrieve cgroup v2 and ancestor id Enable the bpf_get_current_cgroup_id() helper for connect(), sendmsg(), recvmsg() and bind-related hooks in order to retrieve the cgroup v2 context which can then be used as part of the key for BPF map lookups, for example. Given these hooks operate in process context 'current' is always valid and pointing to the app that is performing mentioned syscalls if it's subject to a v2 cgroup. Also with same motivation of commit 7723628101aa ("bpf: Introduce bpf_skb_ancestor_cgroup_id helper") enable retrieval of ancestor from current so the cgroup id can be used for policy lookups which can then forbid connect() / bind(), for example. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d2a7ef42530ad299e3cbb245e6c12374b72145ef.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 18 ++++++++++++++++++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- 6 files changed, 72 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 78046c570596..372708eeaecd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1501,6 +1501,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 914f3463aa41..916f5132a984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01878db15eaf..bafc53ddd350 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -340,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *ancestor; + + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + return cgroup_id(ancestor); +} + +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { + .func = bpf_get_current_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + #ifdef CONFIG_CGROUP_BPF DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); diff --git a/net/core/filter.c b/net/core/filter.c index 3083c7746ee0..5cec3ac9e3dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6018,6 +6018,12 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; @@ -6052,6 +6058,12 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. 
+ * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 50a3e678b58ae9535f9e536f2889649b624943c3 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 25 Mar 2020 18:23:27 +0100 Subject: tools: Add EXPECTED_FD-related definitions in if_link.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the IFLA_XDP_EXPECTED_FD netlink attribute definition and the XDP_FLAGS_REPLACE flag to if_link.h in tools/include. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158515700747.92963.8615391897417388586.stgit@toke.dk --- tools/include/uapi/linux/if_link.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 024af2d1d0af..b1ec7c949e6a 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -960,11 +960,12 @@ enum { #define XDP_FLAGS_SKB_MODE (1U << 1) #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum { @@ -984,6 +985,7 @@ enum { IFLA_XDP_DRV_PROG_ID, IFLA_XDP_SKB_PROG_ID, IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, __IFLA_XDP_MAX, }; -- cgit v1.2.3-59-g8ed1b From fc611f47f2188ade2b48ff6902d5cce8baac0c58 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:49 +0100 Subject: bpf: Introduce BPF_PROG_TYPE_LSM Introduce types and configs for bpf programs that can be attached to LSM hooks. The programs can be enabled by the config option CONFIG_BPF_LSM. 
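As a rough illustration (not part of this patch; the SEC("lsm/...") section handling and the trailing return-value argument follow the selftests added later in this series, so treat the exact conventions as assumptions), a minimal MAC-style program could look like:

/* Sketch: veto mprotect() attempts that add PROT_EXEC (0x4) to a mapping.
 * The argument list mirrors security_file_mprotect(); 'ret' carries the
 * verdict of the previous program/hook in the chain.
 */
#include <errno.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct vm_area_struct;

SEC("lsm/file_mprotect")
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
             unsigned long reqprot, unsigned long prot, int ret)
{
    if (ret != 0)
        return ret;

    if (prot & 0x4) /* PROT_EXEC */
        return -EPERM;

    return 0;
}

char _license[] SEC("license") = "GPL";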
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Reviewed-by: Thomas Garnier Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-2-kpsingh@chromium.org --- MAINTAINERS | 1 + include/linux/bpf.h | 3 +++ include/linux/bpf_types.h | 4 ++++ include/uapi/linux/bpf.h | 2 ++ init/Kconfig | 12 ++++++++++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/libbpf_probes.c | 1 + 10 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/bpf_lsm.c (limited to 'tools/include') diff --git a/MAINTAINERS b/MAINTAINERS index 5dbee41045bc..3197fe9256b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3147,6 +3147,7 @@ R: Martin KaFai Lau R: Song Liu R: Yonghong Song R: Andrii Nakryiko +R: KP Singh L: netdev@vger.kernel.org L: bpf@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 372708eeaecd..3bde59a8453b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1515,6 +1515,9 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +const struct bpf_func_proto *bpf_tracing_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c81d4ece79a4..ba0c2d56f8a3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -70,6 +70,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, void *, void *) BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, void *, void *) +#ifdef CONFIG_BPF_LSM +BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, + void *, void *) +#endif /* CONFIG_BPF_LSM */ #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..deae572d1927 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1616,6 +1616,18 @@ config KALLSYMS_BASE_RELATIVE # end of the "standard kernel features (expert users)" menu # syscall, maps, verifier + +config BPF_LSM + bool "LSM Instrumentation with BPF" + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with eBPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. 
+ config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 046ce5d98033..f2d7be596966 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,4 +29,5 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c new file mode 100644 index 000000000000..82875039ca90 --- /dev/null +++ b/kernel/bpf/bpf_lsm.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include +#include +#include + +const struct bpf_prog_ops lsm_prog_ops = { +}; + +const struct bpf_verifier_ops lsm_verifier_ops = { + .get_func_proto = bpf_tracing_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e619eedb5919..37ffceab608f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -779,8 +779,8 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -865,7 +865,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -975,7 +975,7 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1082,7 +1082,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1210,7 +1210,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index b782ebef6ac9..2c92059c0c90 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -108,6 +108,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: default: break; } -- cgit v1.2.3-59-g8ed1b From 791bb3fcafcedd11f9066da9fee9342ecb6904d0 Mon Sep 17 00:00:00 2001 From: Mark Starovoytov Date: Wed, 25 Mar 2020 16:01:34 +0300 Subject: net: macsec: add support for 
specifying offload upon link creation This patch adds new netlink attribute to allow a user to (optionally) specify the desired offload mode immediately upon MACSec link creation. Separate iproute patch will be required to support this from user space. Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/macsec.c | 31 +++++++++++++++++++++++++++++-- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 31 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 2dad91cba459..da82d7f16a09 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1469,6 +1469,11 @@ static struct net_device *get_dev_from_nl(struct net *net, return dev; } +static enum macsec_offload nla_get_offload(const struct nlattr *nla) +{ + return (__force enum macsec_offload)nla_get_u8(nla); +} + static sci_t nla_get_sci(const struct nlattr *nla) { return (__force sci_t)nla_get_u64(nla); @@ -4012,8 +4017,16 @@ static int macsec_newlink(struct net *net, struct net_device *dev, macsec->real_dev = real_dev; - /* MACsec offloading is off by default */ - macsec->offload = MACSEC_OFFLOAD_OFF; + if (data && data[IFLA_MACSEC_OFFLOAD]) + macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); + else + /* MACsec offloading is off by default */ + macsec->offload = MACSEC_OFFLOAD_OFF; + + /* Check if the offloading mode is supported by the underlying layers */ + if (macsec->offload != MACSEC_OFFLOAD_OFF && + !macsec_check_offload(macsec->offload, macsec)) + return -EOPNOTSUPP; if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); @@ -4056,6 +4069,20 @@ static int macsec_newlink(struct net *net, struct net_device *dev, goto del_dev; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(macsec, &ctx); + if (ops) { + ctx.secy = &macsec->secy; + err = macsec_offload(ops->mdo_add_secy, &ctx); + if (err) + goto del_dev; + } + } + err = register_macsec_dev(real_dev, dev); if (err < 0) goto del_dev; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d6ccd0105c05..e204c3e4dce1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -463,6 +463,7 @@ enum { IFLA_MACSEC_REPLAY_PROTECT, IFLA_MACSEC_VALIDATION, IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, __IFLA_MACSEC_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 771371d5b996..24cf6fe075f7 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -463,6 +463,7 @@ enum { IFLA_MACSEC_REPLAY_PROTECT, IFLA_MACSEC_VALIDATION, IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, __IFLA_MACSEC_MAX, }; -- cgit v1.2.3-59-g8ed1b From cf7fbe660f2dbd738ab58aea8e9b0ca6ad232449 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sun, 29 Mar 2020 15:53:38 -0700 Subject: bpf: Add socket assign support Add support for TPROXY via a new bpf helper, bpf_sk_assign(). This helper requires the BPF program to discover the socket via a call to bpf_sk*_lookup_*(), then pass this socket to the new helper. The helper takes its own reference to the socket in addition to any existing reference that may or may not currently be obtained for the duration of BPF processing. 
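As a rough illustration of that flow (not part of this patch; the hard-coded listener address/port and the IPv4-only lookup are assumptions for the sketch), a tc ingress program would look up the destination socket and hand it to bpf_sk_assign():

/* Sketch: steer matching TCP traffic to a local TPROXY-style listener on
 * 127.0.0.1:1234 by assigning the listener socket to the skb on tc ingress.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("classifier")
int tproxy_assign(struct __sk_buff *skb)
{
    struct bpf_sock_tuple tuple = {};
    struct bpf_sock *sk;
    long err;

    /* assumption: in practice the tuple is parsed from the packet headers */
    tuple.ipv4.daddr = bpf_htonl(0x7f000001); /* 127.0.0.1 */
    tuple.ipv4.dport = bpf_htons(1234);

    sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                            BPF_F_CURRENT_NETNS, 0);
    if (!sk)
        return TC_ACT_OK;

    /* bpf_sk_assign() takes its own reference, so drop ours afterwards */
    err = bpf_sk_assign(skb, sk, 0);
    bpf_sk_release(sk);

    return err ? TC_ACT_SHOT : TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";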
For the destination socket to receive the traffic, the traffic must be routed towards that socket via local route. The simplest example route is below, but in practice you may want to route traffic more narrowly (eg by CIDR): $ ip route add local default dev lo This patch avoids trying to introduce an extra bit into the skb->sk, as that would require more invasive changes to all code interacting with the socket to ensure that the bit is handled correctly, such as all error-handling cases along the path from the helper in BPF through to the orphan path in the input. Instead, we opt to use the destructor variable to switch on the prefetch of the socket. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200329225342.16317-2-joe@wand.net.nz --- include/net/sock.h | 11 +++++++++++ include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- net/core/filter.c | 31 +++++++++++++++++++++++++++++++ net/core/sock.c | 11 +++++++++++ net/ipv4/ip_input.c | 3 ++- net/ipv6/ip6_input.c | 3 ++- net/sched/act_bpf.c | 3 +++ tools/include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- 8 files changed, 108 insertions(+), 4 deletions(-) (limited to 'tools/include') diff --git a/include/net/sock.h b/include/net/sock.h index b5cca7bae69b..dc398cee7873 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1659,6 +1659,7 @@ void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); +void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif @@ -2526,6 +2527,16 @@ void sock_net_set(struct sock *sk, struct net *net) write_pnet(&sk->sk_net, net); } +static inline bool +skb_sk_is_prefetched(struct sk_buff *skb) +{ +#ifdef CONFIG_INET + return skb->destructor == sock_pfree; +#else + return false; +#endif /* CONFIG_INET */ +} + static inline struct sock *skb_steal_sock(struct sk_buff *skb) { if (skb->sk) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1fbc36f58d3..9f786a5a44ac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2983,6 +2983,28 @@ union bpf_attr { * **bpf_get_current_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * Description + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EINVAL** Unsupported flags specified. + * * **-ENOENT** Socket is unavailable for assignment. + * * **-ENETUNREACH** Socket is unreachable (wrong netns). + * * **-EOPNOTSUPP** Unsupported operation, for example a + * call from outside of TC ingress. + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3108,7 +3130,8 @@ union bpf_attr { FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), + FN(get_current_ancestor_cgroup_id), \ + FN(sk_assign), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index bb4a196c8809..ac5c1633f8d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5918,6 +5918,35 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) +{ + if (flags != 0) + return -EINVAL; + if (!skb_at_tc_ingress(skb)) + return -EOPNOTSUPP; + if (unlikely(dev_net(skb->dev) != sock_net(sk))) + return -ENETUNREACH; + if (unlikely(sk->sk_reuseport)) + return -ESOCKTNOSUPPORT; + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + return -ENOENT; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_pfree; + + return 0; +} + +static const struct bpf_func_proto bpf_sk_assign_proto = { + .func = bpf_sk_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6249,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_assign_proto; #endif default: return bpf_base_func_proto(func_id); diff --git a/net/core/sock.c b/net/core/sock.c index 0fc8937a7ff4..87e3a03c9056 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2071,6 +2071,17 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. + */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) +{ + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ + kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index aa438c6758a7..b0c244af1e4d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ - skb_orphan(skb); + if (!skb_sk_is_prefetched(skb)) + skb_orphan(skb); return skb; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 7b089d0ac8cd..e96304d8a4a7 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, rcu_read_unlock(); /* Must drop socket now because of tproxy. 
*/ - skb_orphan(skb); + if (!skb_sk_is_prefetched(skb)) + skb_orphan(skb); return skb; err: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 46f47e58b3be..54d5652cfe6c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } + if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) + skb_orphan(skb); rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1fbc36f58d3..9f786a5a44ac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2983,6 +2983,28 @@ union bpf_attr { * **bpf_get_current_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * Description + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EINVAL** Unsupported flags specified. + * * **-ENOENT** Socket is unavailable for assignment. + * * **-ENETUNREACH** Socket is unreachable (wrong netns). + * * **-EOPNOTSUPP** Unsupported operation, for example a + * call from outside of TC ingress. + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3108,7 +3130,8 @@ union bpf_attr { FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), + FN(get_current_ancestor_cgroup_id), \ + FN(sk_assign), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From af6eea57437a830293eab56246b6025cc7d46ee7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 29 Mar 2020 19:59:58 -0700 Subject: bpf: Implement bpf_link-based cgroup BPF program attachment Implement new sub-command to attach cgroup BPF programs and return FD-based bpf_link back on success. bpf_link, once attached to cgroup, cannot be replaced, except by owner having its FD. Cgroup bpf_link supports only BPF_F_ALLOW_MULTI semantics. Both link-based and prog-based BPF_F_ALLOW_MULTI attachments can be freely intermixed. To prevent bpf_cgroup_link from keeping cgroup alive past the point when no BPF program can be executed, implement auto-detachment of link. When cgroup_bpf_release() is called, all attached bpf_links are forced to release cgroup refcounts, but they leave bpf_link otherwise active and allocated, as well as still owning underlying bpf_prog. This is because user-space might still have FDs open and active, so bpf_link as a user-referenced object can't be freed yet. Once last active FD is closed, bpf_link will be freed and underlying bpf_prog refcount will be dropped. But cgroup refcount won't be touched, because cgroup is released already. 
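As a rough user-space illustration (not part of this patch; it assumes uapi headers synced to include BPF_LINK_CREATE and an already-loaded cgroup program FD), creating such a link boils down to one new bpf() command:

/* Sketch: attach prog_fd to a cgroup v2 directory via BPF_LINK_CREATE and
 * return the link FD; the attachment lives for as long as the link FD (or a
 * pin of it) stays open, and a dying cgroup auto-detaches the link.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
    return syscall(__NR_bpf, cmd, attr, size);
}

int attach_cgroup_link(int prog_fd, const char *cgroup_path)
{
    union bpf_attr attr;
    int cgroup_fd, link_fd;

    cgroup_fd = open(cgroup_path, O_RDONLY);
    if (cgroup_fd < 0)
        return -1;

    memset(&attr, 0, sizeof(attr));
    attr.link_create.prog_fd = prog_fd;
    attr.link_create.target_fd = cgroup_fd;
    attr.link_create.attach_type = BPF_CGROUP_INET_INGRESS;
    attr.link_create.flags = 0; /* must be zero for cgroup links */

    link_fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
    close(cgroup_fd);
    return link_fd;
}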
The inherent race between bpf_cgroup_link release (from closing last FD) and cgroup_bpf_release() is resolved by both operations taking cgroup_mutex. So the only additional check required is when bpf_cgroup_link attempts to detach itself from cgroup. At that time we need to check whether there is still cgroup associated with that link. And if not, exit with success, because bpf_cgroup_link was already successfully detached. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20200330030001.2312810-2-andriin@fb.com --- include/linux/bpf-cgroup.h | 29 +++- include/linux/bpf.h | 10 +- include/uapi/linux/bpf.h | 10 +- kernel/bpf/cgroup.c | 315 +++++++++++++++++++++++++++++++---------- kernel/bpf/syscall.c | 64 +++++++-- kernel/cgroup/cgroup.c | 14 +- tools/include/uapi/linux/bpf.h | 10 +- 7 files changed, 354 insertions(+), 98 deletions(-) (limited to 'tools/include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7cd5c7a2509..d2d969669564 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -51,9 +51,18 @@ struct bpf_cgroup_storage { struct rcu_head rcu; }; +struct bpf_cgroup_link { + struct bpf_link link; + struct cgroup *cgroup; + enum bpf_attach_type type; +}; + +extern const struct bpf_link_ops bpf_cgroup_link_lops; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; @@ -84,20 +93,23 @@ struct cgroup_bpf { int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags); + enum bpf_attach_type type); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, return -EINVAL; } +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 
3bde59a8453b..56254d880293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); -struct bpf_link; +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f786a5a44ac..37dffe5089a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9c8472823a7f..c24029937431 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) bpf_cgroup_storage_unlink(storages[stype]); } +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It + * doesn't free link memory, which will eventually be done by bpf_link's + * release() callback, when its last FD is closed. + */ +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) +{ + cgroup_put(link->cgroup); + link->cgroup = NULL; +} + /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); - bpf_prog_put(pl->prog); + if (pl->prog) + bpf_prog_put(pl->prog); + if (pl->link) + bpf_cgroup_link_auto_detach(pl->link); bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_free(pl->storage); kfree(pl); @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) queue_work(system_wq, &cgrp->bpf.release_work); } +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through + * link or direct prog. + */ +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) +{ + if (pl->prog) + return pl->prog; + if (pl->link) + return pl->link->link.prog; + return NULL; +} + /* count number of elements in the list. 
* it's slow but the list cannot be long */ @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) u32 cnt = 0; list_for_each_entry(pl, head, node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; cnt++; } @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; list_for_each_entry(pl, &p->bpf.progs[type], node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; item = &progs->items[cnt]; - item->prog = pl->prog; + item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; @@ -333,19 +359,60 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + struct bpf_prog *replace_prog, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + /* single-attach case */ + if (!allow_multi) { + if (list_empty(progs)) + return NULL; + return list_first_entry(progs, typeof(*pl), node); + } + + list_for_each_entry(pl, progs, node) { + if (prog && pl->prog == prog) + /* disallow attaching the same prog twice */ + return ERR_PTR(-EINVAL); + if (link && pl->link == link) + /* disallow attaching the same link twice */ + return ERR_PTR(-EINVAL); + } + + /* direct prog multi-attach w/ replacement case */ + if (replace_prog) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == replace_prog) + /* a match found */ + return pl; + } + /* prog to replace not found for cgroup */ + return ERR_PTR(-ENOENT); + } + + return NULL; +} + /** - * __cgroup_bpf_attach() - Attach the program to a cgroup, and + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * + * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. 
*/ -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; - struct bpf_prog_list *pl, *replace_pl = NULL; + struct bpf_prog_list *pl; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if (link && (prog || replace_prog)) + /* only either link or prog/replace_prog can be specified */ + return -EINVAL; + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) + /* replace_prog implies BPF_F_REPLACE, and vice versa */ + return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) - /* disallow attaching the same prog twice */ - return -EINVAL; - if (pl->prog == replace_prog) - replace_pl = pl; - } - if ((flags & BPF_F_REPLACE) && !replace_pl) - /* prog to replace not found for cgroup */ - return -ENOENT; - } else if (!list_empty(progs)) { - replace_pl = list_first_entry(progs, typeof(*pl), node); - } + pl = find_attach_entry(progs, prog, link, replace_prog, + flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog)) + if (bpf_cgroup_storages_alloc(storage, prog ? 
: link->link.prog)) return -ENOMEM; - if (replace_pl) { - pl = replace_pl; + if (pl) { old_prog = pl->prog; bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_assign(old_storage, pl->storage); @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } pl->prog = prog; + pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[type] = saved_flags; @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (err) goto cleanup; - static_branch_inc(&cgroup_bpf_enabled_key); bpf_cgroup_storages_free(old_storage); - if (old_prog) { + if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); - } - bpf_cgroup_storages_link(storage, cgrp, type); + else + static_branch_inc(&cgroup_bpf_enabled_key); + bpf_cgroup_storages_link(pl->storage, cgrp, type); return 0; cleanup: - /* and cleanup the prog list */ - pl->prog = old_prog; + if (old_prog) { + pl->prog = old_prog; + pl->link = NULL; + } bpf_cgroup_storages_free(pl->storage); bpf_cgroup_storages_assign(pl->storage, old_storage); bpf_cgroup_storages_link(pl->storage, cgrp, type); - if (!replace_pl) { + if (!old_prog) { list_del(&pl->node); kfree(pl); } return err; } +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + if (!allow_multi) { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return ERR_PTR(-ENOENT); + + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) in legacy mode + */ + return list_first_entry(progs, typeof(*pl), node); + } + + if (!prog && !link) + /* to detach MULTI prog the user has to specify valid FD + * of the program or link to be detached + */ + return ERR_PTR(-EINVAL); + + /* find the prog or link and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog && pl->link == link) + return pl; + } + return ERR_PTR(-ENOENT); +} + /** - * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL + * @prog: A link to detach or NULL * @type: Type of detach operation * + * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. 
*/ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; + struct bpf_prog *old_prog; int err; - if (flags & BPF_F_ALLOW_MULTI) { - if (!prog) - /* to detach MULTI prog the user has to specify valid FD - * of the program to be detached - */ - return -EINVAL; - } else { - if (list_empty(progs)) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - } + if (prog && link) + /* only one of prog or link can be specified */ + return -EINVAL; - if (flags & BPF_F_ALLOW_MULTI) { - /* find the prog and detach it */ - list_for_each_entry(pl, progs, node) { - if (pl->prog != prog) - continue; - old_prog = prog; - /* mark it deleted, so it's ignored while - * recomputing effective - */ - pl->prog = NULL; - break; - } - if (!old_prog) - return -ENOENT; - } else { - /* to maintain backward compatibility NONE and OVERRIDE cgroups - * allow detaching with invalid FD (prog==NULL) - */ - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - pl->prog = NULL; - } + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + /* mark it deleted, so it's ignored while recomputing effective */ + old_prog = pl->prog; + pl->prog = NULL; + pl->link = NULL; err = update_effective_progs(cgrp, type); if (err) @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (list_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[type] = 0; - - bpf_prog_put(old_prog); + if (old_prog) + bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); return 0; cleanup: - /* and restore back old_prog */ + /* restore back prog or link */ pl->prog = old_prog; + pl->link = link; return err; } @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog_array *effective; + struct bpf_prog *prog; int cnt, ret = 0, i; effective = rcu_dereference_protected(cgrp->bpf.effective[type], @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, i = 0; list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; + prog = prog_list_prog(pl); + id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } } - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, - attr->attach_flags); + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, + attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static void bpf_cgroup_link_release(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + /* link might have been auto-detached by dying cgroup already, 
+ * in that case our work is done here + */ + if (!cg_link->cgroup) + return; + + mutex_lock(&cgroup_mutex); + + /* re-check cgroup under lock again */ + if (!cg_link->cgroup) { + mutex_unlock(&cgroup_mutex); + return; + } + + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, + cg_link->type)); + + mutex_unlock(&cgroup_mutex); + cgroup_put(cg_link->cgroup); +} + +static void bpf_cgroup_link_dealloc(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + kfree(cg_link); +} + +const struct bpf_link_ops bpf_cgroup_link_lops = { + .release = bpf_cgroup_link_release, + .dealloc = bpf_cgroup_link_dealloc, +}; + +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_cgroup_link *link; + struct file *link_file; + struct cgroup *cgrp; + int err, link_fd; + + if (attr->link_create.flags) + return -EINVAL; + + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_cgroup; + } + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + link->cgroup = cgrp; + link->type = attr->link_create.attach_type; + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_cgroup; + } + + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, + BPF_F_ALLOW_MULTI); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_cgroup; + } + + fd_install(link_fd, link_file); + return link_fd; + +out_put_cgroup: + cgroup_put(cgrp); + return err; +} + int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a616b63f23b4..97d5c6fb63cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -struct bpf_link { - atomic64_t refcnt; - const struct bpf_link_ops *ops; - struct bpf_prog *prog; - struct work_struct work; -}; - void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog) { @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, * anon_inode's release() call. This helper manages marking bpf_link as * defunct, releases anon_inode file and puts reserved FD. 
*/ -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) { link->prog = NULL; fput(link_file); @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) link_type = "raw_tracepoint"; else if (link->ops == &bpf_tracing_link_lops) link_type = "tracing"; +#ifdef CONFIG_CGROUP_BPF + else if (link->ops == &bpf_cgroup_link_lops) + link_type = "cgroup"; +#endif else link_type = "unknown"; @@ -3553,6 +3550,52 @@ err_put: return err; } +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +static int link_create(union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_CREATE)) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_attach_check_attach_type(prog, + attr->link_create.attach_type); + if (ret) + goto err_out; + + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + ret = cgroup_bpf_link_attach(attr, prog); + break; + default: + ret = -EINVAL; + } + +err_out: + if (ret < 0) + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3663,6 +3706,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); break; + case BPF_LINK_CREATE: + err = link_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3dead0416b91..219624fba9ba 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9f786a5a44ac..37dffe5089a0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, 
BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3-59-g8ed1b From cc4f864bb118e0ae7bf9f4e3418eaeb083aa34f2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 29 Mar 2020 20:00:00 -0700 Subject: libbpf: Add support for bpf_link-based cgroup attachment Add bpf_program__attach_cgroup(), which uses BPF_LINK_CREATE subcommand to create an FD-based kernel bpf_link. Also add low-level bpf_link_create() API. If expected_attach_type is not specified explicitly with bpf_program__set_expected_attach_type(), libbpf will try to determine proper attach type from BPF program's section definition. Also add support for bpf_link's underlying BPF program replacement: - unconditional through high-level bpf_link__update_program() API; - cmpxchg-like with specifying expected current BPF program through low-level bpf_link_update() API. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330030001.2312810-4-andriin@fb.com --- tools/include/uapi/linux/bpf.h | 12 +++++++++++ tools/lib/bpf/bpf.c | 34 +++++++++++++++++++++++++++++++ tools/lib/bpf/bpf.h | 19 +++++++++++++++++ tools/lib/bpf/libbpf.c | 46 ++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 8 +++++++- tools/lib/bpf/libbpf.map | 4 ++++ 6 files changed, 122 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 37dffe5089a0..2e29a671d67e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_cmd { BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, + BPF_LINK_UPDATE, }; enum bpf_map_type { @@ -577,6 +578,17 @@ union bpf_attr { __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + /* new program fd to update link with */ + __u32 new_prog_fd; + __u32 flags; /* extra flags */ + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags */ + __u32 old_prog_fd; + } link_update; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 73220176728d..5cc1b0785d18 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -585,6 +585,40 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } +int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_link_create_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = 
target_fd; + attr.link_create.attach_type = attach_type; + + return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); +} + +int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_link_update_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.link_update.link_fd = link_fd; + attr.link_update.new_prog_fd = new_prog_fd; + attr.link_update.flags = OPTS_GET(opts, flags, 0); + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + + return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); +} + int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) { diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index b976e77316cc..46d47afdd887 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -168,6 +168,25 @@ LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); +struct bpf_link_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ +}; +#define bpf_link_create_opts__last_field sz + +LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts); + +struct bpf_link_update_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; /* extra flags */ + __u32 old_prog_fd; /* expected old program FD */ +}; +#define bpf_link_update_opts__last_field old_prog_fd + +LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts); + struct bpf_prog_test_run_attr { int prog_fd; int repeat; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0638e717f502..ff9174282a8c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6978,6 +6978,12 @@ struct bpf_link { bool disconnected; }; +/* Replace link's underlying BPF program with the new one */ +int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) +{ + return bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); +} + /* Release "ownership" of underlying BPF resource (typically, BPF program * attached to some BPF hook, e.g., tracepoint, kprobe, etc). 
Disconnected * link, when destructed through bpf_link__destroy() call won't attempt to @@ -7533,6 +7539,46 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, return bpf_program__attach_lsm(prog); } +struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +{ + const struct bpf_sec_def *sec_def; + enum bpf_attach_type attach_type; + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("program '%s': can't attach before loaded\n", + bpf_program__title(prog, false)); + return ERR_PTR(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + attach_type = bpf_program__get_expected_attach_type(prog); + if (!attach_type) { + sec_def = find_sec_def(bpf_program__title(prog, false)); + if (sec_def) + attach_type = sec_def->attach_type; + } + link_fd = bpf_link_create(prog_fd, cgroup_fd, attach_type, NULL); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("program '%s': failed to attach to cgroup: %s\n", + bpf_program__title(prog, false), + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return ERR_PTR(link_fd); + } + link->fd = link_fd; + return link; +} + struct bpf_link *bpf_program__attach(struct bpf_program *prog) { const struct bpf_sec_def *sec_def; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 55348724c355..44df1d3e7287 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -224,6 +224,8 @@ LIBBPF_API int bpf_link__fd(const struct bpf_link *link); LIBBPF_API const char *bpf_link__pin_path(const struct bpf_link *link); LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); LIBBPF_API int bpf_link__unpin(struct bpf_link *link); +LIBBPF_API int bpf_link__update_program(struct bpf_link *link, + struct bpf_program *prog); LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); LIBBPF_API int bpf_link__destroy(struct bpf_link *link); @@ -245,13 +247,17 @@ bpf_program__attach_tracepoint(struct bpf_program *prog, LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(struct bpf_program *prog, const char *tp_name); - LIBBPF_API struct bpf_link * bpf_program__attach_trace(struct bpf_program *prog); LIBBPF_API struct bpf_link * bpf_program__attach_lsm(struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd); + struct bpf_map; + LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); + struct bpf_insn; /* diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index eabd3d3e689f..bb8831605b25 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -243,7 +243,11 @@ LIBBPF_0.0.8 { bpf_link__pin; bpf_link__pin_path; bpf_link__unpin; + bpf_link__update_program; + bpf_link_create; + bpf_link_update; bpf_map__set_initial_value; + bpf_program__attach_cgroup; bpf_program__attach_lsm; bpf_program__is_lsm; bpf_program__set_attach_target; -- cgit v1.2.3-59-g8ed1b
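
A minimal userspace sketch of how the bpf_link-based cgroup APIs introduced above are meant to be used; this is not part of the patches themselves. It opens a BPF object, attaches one program to a cgroup through bpf_program__attach_cgroup() (BPF_LINK_CREATE under the hood), then swaps in a second program through bpf_link__update_program() (BPF_LINK_UPDATE). The object file name "cgroup_skb.o", the section titles, and the cgroup path are hypothetical placeholders, and error handling is abbreviated.

/* Usage sketch for bpf_link-based cgroup attachment; file, section and
 * cgroup names below are hypothetical, error handling is abbreviated.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_program *prog, *prog2;
	struct bpf_object *obj;
	struct bpf_link *link;
	int cgroup_fd, err;

	obj = bpf_object__open_file("cgroup_skb.o", NULL);
	if (libbpf_get_error(obj))
		return 1;
	if (bpf_object__load(obj))
		return 1;

	/* The section title lets libbpf infer the expected attach type
	 * when it is not set explicitly.
	 */
	prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress");
	prog2 = bpf_object__find_program_by_title(obj, "cgroup_skb/egress_v2");
	if (!prog || !prog2)
		return 1;

	cgroup_fd = open("/sys/fs/cgroup/test", O_RDONLY);
	if (cgroup_fd < 0)
		return 1;

	/* Creates an FD-based bpf_link via BPF_LINK_CREATE. */
	link = bpf_program__attach_cgroup(prog, cgroup_fd);
	if (libbpf_get_error(link))
		return 1;

	/* Unconditionally replace the program behind the link. */
	err = bpf_link__update_program(link, prog2);
	if (err)
		fprintf(stderr, "link update failed: %d\n", err);

	bpf_link__destroy(link);
	close(cgroup_fd);
	bpf_object__close(obj);
	return 0;
}

For the cmpxchg-like replacement described in the commit message, the low-level bpf_link_update() would instead be called with a bpf_link_update_opts carrying BPF_F_REPLACE in .flags and the currently expected program's FD in .old_prog_fd; if the link's program no longer matches that FD, the update fails.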