From c4dcfdd406aa2167396ac215e351e5e4dfd7efe3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:26 +0000 Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx The retval value is moved to struct bpf_cg_run_ctx for ease of access in different prog types with different context structs layouts. The helper implementation (to be added in a later patch in the series) can simply perform a container_of from current->bpf_ctx to retrieve bpf_cg_run_ctx. Unfortunately, there is no easy way to access the current task_struct via the verifier BPF bytecode rewrite, aside from possibly calling a helper, so a pointer to current task is added to struct bpf_sockopt_kern so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with an indirection. For backward compatibility, if a getsockopt program rejects a syscall by returning 0, an -EPERM will be generated, by having the BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to -EPERM. Unlike prior to this patch, this -EPERM will be visible to ctx->retval for any other hooks down the line in the prog array. Additionally, the restriction that getsockopt filters can only set the retval to 0 is removed, considering that certain getsockopt implementations may return optlen. Filters are now able to set the value arbitrarily. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 71fa57b88bfc..d23e999dc032 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1356,7 +1356,10 @@ struct bpf_sockopt_kern { s32 level; s32 optname; s32 optlen; - s32 retval; + /* for retval in struct bpf_cg_run_ctx */ + struct task_struct *current_task; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); -- cgit v1.3-8-gc7d7 From ed2d9e1a26cca963ff5ed3b76326d70f7d8201a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:36 -0800 Subject: bpf: Use size instead of pages in bpf_binary_header This is necessary to charge sub page memory for the BPF program. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-4-song@kernel.org --- include/linux/filter.h | 6 +++--- kernel/bpf/core.c | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index d23e999dc032..5855eb474c62 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,7 +548,7 @@ struct sock_fprog_kern { #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { - u32 pages; + u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; @@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->pages); - set_memory_x((unsigned long)hdr, hdr->pages); + set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } static inline struct bpf_binary_header * diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6ca0550c4b24..14199228a6f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog) WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = addr + hdr->size; } static void @@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; if (bpf_jit_charge_modmem(size)) return NULL; @@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); + bpf_jit_uncharge_modmem(size); } /* This symbol is only overridden by archs that have different -- cgit v1.3-8-gc7d7 From 33c9805860e584b194199cab1a1e81f4e6395408 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:41 -0800 Subject: bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free] This is the jit binary allocator built on top of bpf_prog_pack. bpf_prog_pack allocates RO memory, which cannot be used directly by the JIT engine. Therefore, a temporary rw buffer is allocated for the JIT engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy the program to the RO memory. bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal instructions, which is small than the 128 bytes space reserved by bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr to find the correct header. Also, flag use_bpf_prog_pack is added to differentiate a program allocated by bpf_jit_binary_pack_alloc. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-9-song@kernel.org --- include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++++----- kernel/bpf/core.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 10 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea0d7fd4a410..2fc7e5c5ef41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -953,6 +953,7 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool use_bpf_prog_pack; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5855eb474c62..1cb1af917617 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -static inline struct bpf_binary_header * -bpf_jit_binary_hdr(const struct bpf_prog *fp) -{ - unsigned long real_start = (unsigned long)fp->bpf_func; - unsigned long addr = real_start & PAGE_MASK; - - return (void *)addr; -} - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, + unsigned int alignment, + struct bpf_binary_header **rw_hdr, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); + int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ae590897b73..306aa63fa58e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) bpf_jit_uncharge_modmem(size); } +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * as the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * Also, ro_header->size in 2) is not properly set yet, so rw_header->size + * is used for uncharge. + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = rw_header ? rw_header->size : ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; +} + /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. @@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } -- cgit v1.3-8-gc7d7 From 8d21ec0e46ed6e39994accff8eb4f2be3d2e76b5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:34 -0800 Subject: bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_skb_delivery_time() * __sk_buff->delivery_time_type: This patch adds __sk_buff->delivery_time_type. It tells if the delivery_time is stored in __sk_buff->tstamp or not. It will be most useful for ingress to tell if the __sk_buff->tstamp has the (rcv) timestamp or delivery_time. If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp. Two non-zero types are defined for the delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC. For UNSPEC, it can only happen in egress because only mono delivery_time can be forwarded to ingress now. The clock of UNSPEC delivery_time can be deduced from the skb->sk->sk_clockid which is how the sch_etf doing it also. * Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it does, it means the tc-bpf@ingress is expecting the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp. * bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*(). bpf only has mono clock helper (bpf_ktime_get_ns), and the current known use case is the mono EDT for fq, and only mono delivery time can be kept during forward now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +- include/uapi/linux/bpf.h | 41 +++++++++- net/core/filter.c | 169 ++++++++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 41 +++++++++- 4 files changed, 216 insertions(+), 38 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1cb1af917617..9bf26307247f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -572,7 +572,8 @@ struct bpf_prog { has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1; /* Do we call get_func_ip() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index 5072733743e9..88767f7da150 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, + u64, dtime, u32, dtime_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (dtime_type) { + case BPF_SKB_DELIVERY_TIME_MONO: + if (!dtime) + return -EINVAL; + skb->tstamp = dtime; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_DELIVERY_TIME_NONE: + if (dtime) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { + .func = bpf_skb_set_delivery_time, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7749,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_delivery_time: + return &bpf_skb_set_delivery_time_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8088,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, delivery_time_type): + return false; + case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8443,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, delivery_time_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->delivery_time_type or not. + * Thus, we need to set prog->delivery_time_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->delivery_time_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); @@ -8838,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); + + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); + +#ifdef CONFIG_NET_CLS_ACT + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* At ingress, value_reg = 0 */ + *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); +#endif + + /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + + /* 15 insns with CONFIG_NET_CLS_ACT */ + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8859,29 +8948,32 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); - /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, - * so check the skb->mono_delivery_time. - */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ - *insn++ = BPF_MOV64_IMM(value_reg, 0); - *insn++ = BPF_JMP_A(1); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); + /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, + * so check the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, @@ -8889,27 +8981,30 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); - /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. - * Clear the skb->mono_delivery_time. - */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - ~SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); + /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. + * Clear the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + } #endif /* skb->tstamp = tstamp */ @@ -9226,9 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - insn = bpf_convert_tstamp_write(si, insn); + insn = bpf_convert_tstamp_write(prog, si, insn); else - insn = bpf_convert_tstamp_read(si, insn); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, delivery_time_type): + insn = bpf_convert_dtime_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.3-8-gc7d7 From 9bb984f28d5bcb917d35d930fcfb89f90f9449fd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:05:09 -0800 Subject: bpf: Remove BPF_SKB_DELIVERY_TIME_NONE and rename s/delivery_time_/tstamp_/ This patch is to simplify the uapi bpf.h regarding to the tstamp type and use a similar way as the kernel to describe the value stored in __sk_buff->tstamp. My earlier thought was to avoid describing the semantic and clock base for the rcv timestamp until there is more clarity on the use case, so the __sk_buff->delivery_time_type naming instead of __sk_buff->tstamp_type. With some thoughts, it can reuse the UNSPEC naming. This patch first removes BPF_SKB_DELIVERY_TIME_NONE and also rename BPF_SKB_DELIVERY_TIME_UNSPEC to BPF_SKB_TSTAMP_UNSPEC and BPF_SKB_DELIVERY_TIME_MONO to BPF_SKB_TSTAMP_DELIVERY_MONO. The semantic of BPF_SKB_TSTAMP_DELIVERY_MONO is the same: __sk_buff->tstamp has delivery time in mono clock base. BPF_SKB_TSTAMP_UNSPEC means __sk_buff->tstamp has the (rcv) tstamp at ingress and the delivery time at egress. At egress, the clock base could be found from skb->sk->sk_clockid. __sk_buff->tstamp == 0 naturally means NONE, so NONE is not needed. With BPF_SKB_TSTAMP_UNSPEC for the rcv tstamp at ingress, the __sk_buff->delivery_time_type is also renamed to __sk_buff->tstamp_type which was also suggested in the earlier discussion: https://lore.kernel.org/bpf/b181acbe-caf8-502d-4b7b-7d96b9fc5d55@iogearbox.net/ The above will then make __sk_buff->tstamp and __sk_buff->tstamp_type the same as its kernel skb->tstamp and skb->mono_delivery_time counter part. The internal kernel function bpf_skb_convert_dtime_type_read() is then renamed to bpf_skb_convert_tstamp_type_read() and it can be simplified with the BPF_SKB_DELIVERY_TIME_NONE gone. A BPF_ALU32_IMM(BPF_AND) insn is also saved by using BPF_JMP32_IMM(BPF_JSET). The bpf helper bpf_skb_set_delivery_time() is also renamed to bpf_skb_set_tstamp(). The arg name is changed from dtime to tstamp also. It only allows setting tstamp 0 for BPF_SKB_TSTAMP_UNSPEC and it could be relaxed later if there is use case to change mono delivery time to non mono. prog->delivery_time_access is also renamed to prog->tstamp_type_access. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090509.3712315-1-kafai@fb.com --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 40 ++++++++++--------- net/core/filter.c | 88 ++++++++++++++++-------------------------- tools/include/uapi/linux/bpf.h | 40 ++++++++++--------- 4 files changed, 77 insertions(+), 93 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9bf26307247f..05ed9bd31b45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -573,7 +573,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ - delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index f914e4b13b18..03655f2074ae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,36 +7388,36 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, - u64, dtime, u32, dtime_type) +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; - switch (dtime_type) { - case BPF_SKB_DELIVERY_TIME_MONO: - if (!dtime) + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) return -EINVAL; - skb->tstamp = dtime; + skb->tstamp = tstamp; skb->mono_delivery_time = 1; break; - case BPF_SKB_DELIVERY_TIME_NONE: - if (dtime) + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) return -EINVAL; skb->tstamp = 0; skb->mono_delivery_time = 0; break; default: - return -EOPNOTSUPP; + return -EINVAL; } return 0; } -static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { - .func = bpf_skb_set_delivery_time, +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -7786,8 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; - case BPF_FUNC_skb_set_delivery_time: - return &bpf_skb_set_delivery_time_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8127,9 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): return false; - case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8484,14 +8484,14 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog - * has used __sk_buff->delivery_time_type or not. - * Thus, we need to set prog->delivery_time_access + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. */ - ((struct bpf_prog *)prog)->delivery_time_access = 1; + ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } @@ -8888,42 +8888,22 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, - struct bpf_insn *insn) +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); - - *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, - offsetof(struct sk_buff, tstamp)); - *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); - -#ifdef CONFIG_NET_CLS_ACT - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* At ingress, value_reg = 0 */ - *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); *insn++ = BPF_JMP_A(1); -#endif - - /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); - /* 15 insns with CONFIG_NET_CLS_ACT */ return insn; } @@ -8956,11 +8936,11 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, read skb->tstamp as is if delivery_time_access is true. + * Thus, read skb->tstamp as is if tstamp_type_access is true. */ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; @@ -8990,13 +8970,13 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, write skb->tstamp as is if delivery_time_access is true. + * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * mono_delivery_time bit also. */ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); @@ -9329,8 +9309,8 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, insn = bpf_convert_tstamp_read(prog, si, insn); break; - case offsetof(struct __sk_buff, delivery_time_type): - insn = bpf_convert_dtime_type_read(si, insn); + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.3-8-gc7d7 From d2a3b7c5becc3992f8e7d2b9bf5eacceeedb9a48 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 9 Mar 2022 20:33:20 +0800 Subject: bpf: Fix net.core.bpf_jit_harden race It is the bpf_jit_harden counterpart to commit 60b58afc96c9 ("bpf: fix net.core.bpf_jit_enable race"). bpf_jit_harden will be tested twice for each subprog if there are subprogs in bpf program and constant blinding may increase the length of program, so when running "./test_progs -t subprogs" and toggling bpf_jit_harden between 0 and 2, jit_subprogs may fail because constant blinding increases the length of subprog instructions during extra passs. So cache the value of bpf_jit_blinding_enabled() during program allocation, and use the cached value during constant blinding, subprog JITing and args tracking of tail call. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220309123321.2400262-4-houtao1@huawei.com --- include/linux/filter.h | 1 + kernel/bpf/core.c | 3 ++- kernel/bpf/verifier.c | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/filter.h') diff --git a/include/linux/filter.h b/include/linux/filter.h index 05ed9bd31b45..ed0c0ff42ad5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -566,6 +566,7 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab630f773ec1..1324f9523e7c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,6 +105,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -1382,7 +1383,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0db6cd8dcb35..cf92f9c01556 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13023,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -13146,6 +13147,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -13239,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; - bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13403,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && -- cgit v1.3-8-gc7d7