From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:08 +0300 Subject: perf: Add perf text poke event Record (single instruction) changes to the kernel text (i.e. self-modifying code) in order to support tracers like Intel PT and ARM CoreSight. A copy of the running kernel code is needed as a reference point (e.g. from /proc/kcore). The text poke event records the old bytes and the new bytes so that the event can be processed forwards or backwards. The basic problem is recording the modified instruction in an unambiguous manner given SMP instruction cache (in)coherence. That is, when modifying an instruction concurrently any solution with one or multiple timestamps is not sufficient: CPU0 CPU1 0 1 write insn A 2 execute insn A 3 sync-I$ 4 Due to I$, CPU1 might execute either the old or new A. No matter where we record tracepoints on CPU0, one simply cannot tell what CPU1 will have observed, except that at 0 it must be the old one and at 4 it must be the new one. To solve this, take inspiration from x86 text poking, which has to solve this exact problem due to variable length instruction encoding and I-fetch windows. 1) overwrite the instruction with a breakpoint and sync I$ This guarantees that that code flow will never hit the target instruction anymore, on any CPU (or rather, it will cause an exception). 2) issue the TEXT_POKE event 3) overwrite the breakpoint with the new instruction and sync I$ Now we know that any execution after the TEXT_POKE event will either observe the breakpoint (and hit the exception) or the new instruction. So by guarding the TEXT_POKE event with an exception on either side; we can now tell, without doubt, which instruction another CPU will have observed. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..e5bee6c17b86 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -383,7 +383,8 @@ struct perf_event_attr { bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ - __reserved_1 : 31; + text_poke : 1, /* include text poke events */ + __reserved_1 : 30; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1024,6 +1025,24 @@ enum perf_event_type { */ PERF_RECORD_CGROUP = 19, + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + PERF_RECORD_MAX, /* non-ABI */ }; -- cgit v1.2.3-59-g8ed1b From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:11 +0300 Subject: kprobes: Add perf ksymbol events for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 5 +++++ kernel/kprobes.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e5bee6c17b86..e1a4179144a1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1049,6 +1049,11 @@ enum perf_event_type { enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 058c0be3464b..2b58740ca0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); -- cgit v1.2.3-59-g8ed1b From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:14 +0300 Subject: ftrace: Add perf ksymbol events for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 2 +- kernel/trace/ftrace.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e1a4179144a1..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_BPF = 1, /* * Out of line code such as kprobe-replaced instructions or optimized - * kprobes. + * kprobes or ftrace trampolines. */ PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31675b209db2..2baaf7716537 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && - ops->trampoline) + ops->trampoline) { + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ ftrace_remove_trampoline_from_kallsyms(ops); + } arch_ftrace_trampoline_free(ops); } @@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); if (ops->trampoline && ops->trampoline != trampoline && - (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + } } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From 9b4feb630e8e9801603f3cab3a36369e3c1cf88d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 May 2019 11:31:44 +0200 Subject: arch: wire-up close_range() This wires up the close_range() syscall into all arches at once. Suggested-by: Arnd Bergmann Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Acked-by: Arnd Bergmann Acked-by: Michael Ellerman (powerpc) Cc: Jann Horn Cc: David Howells Cc: Dmitry V. Levin Cc: Linus Torvalds Cc: Al Viro Cc: Florian Weimer Cc: linux-api@vger.kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: x86@kernel.org --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/uapi/asm-generic/unistd.h | 2 ++ 18 files changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 5ddd128d4b7a..a28fb211881d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -475,6 +475,7 @@ 543 common fspick sys_fspick 544 common pidfd_open sys_pidfd_open # 545 reserved for clone3 +546 common close_range sys_close_range 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d5cae5ffede0..7e8ee4adf269 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -449,6 +449,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6d95d0c8bf2f..c760b9e159f5 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick) __SYSCALL(__NR_pidfd_open, sys_pidfd_open) #define __NR_clone3 435 __SYSCALL(__NR_clone3, sys_clone3) +#define __NR_close_range 436 +__SYSCALL(__NR_close_range, sys_close_range) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 49e325b604b3..ced9c83e47c9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -356,6 +356,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f71b1bbcc198..1a4822de7292 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -435,6 +435,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 __sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index edacc4561f2b..a3f4be8e7238 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -441,6 +441,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index f777141f5256..501bc09643bd 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -374,6 +374,7 @@ 433 n32 fspick sys_fspick 434 n32 pidfd_open sys_pidfd_open 435 n32 clone3 __sys_clone3 +436 n32 close_range sys_close_range 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index da8c76394e17..391acbf425a0 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -350,6 +350,7 @@ 433 n64 fspick sys_fspick 434 n64 pidfd_open sys_pidfd_open 435 n64 clone3 __sys_clone3 +436 n64 close_range sys_close_range 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 13280625d312..d28f12f641d3 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -423,6 +423,7 @@ 433 o32 fspick sys_fspick 434 o32 pidfd_open sys_pidfd_open 435 o32 clone3 __sys_clone3 +436 o32 close_range sys_close_range 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5a758fa6ec52..5d76b8f15197 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -433,6 +433,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3_wrapper +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f833a3190822..dd87a782d80e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -525,6 +525,7 @@ 435 32 clone3 ppc_clone3 sys_clone3 435 64 clone3 sys_clone3 435 spu clone3 sys_ni_syscall +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bfdcb7633957..effb5195608c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -438,6 +438,7 @@ 433 common fspick sys_fspick sys_fspick 434 common pidfd_open sys_pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index acc35daa1b79..96848db9659e 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -438,6 +438,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8004a276cb74..d6447d08c9a1 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -481,6 +481,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..3f0c6546b47c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -440,6 +440,7 @@ 433 i386 fspick sys_fspick 434 i386 pidfd_open sys_pidfd_open 435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..f8637c2c863d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -357,6 +357,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 69d0d73876b3..d216ccba42f7 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -406,6 +406,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f4a01305d9a6..31001e3323bc 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -850,6 +850,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open) #define __NR_clone3 435 __SYSCALL(__NR_clone3, sys_clone3) #endif +#define __NR_close_range 436 +__SYSCALL(__NR_close_range, sys_close_range) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) -- cgit v1.2.3-59-g8ed1b From 60997c3d45d9a67daf01c56d805ae4fec37e0bd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 3 Jun 2020 21:48:55 +0200 Subject: close_range: add CLOSE_RANGE_UNSHARE One of the use-cases of close_range() is to drop file descriptors just before execve(). This would usually be expressed in the sequence: unshare(CLONE_FILES); close_range(3, ~0U); as pointed out by Linus it might be desirable to have this be a part of close_range() itself under a new flag CLOSE_RANGE_UNSHARE. This expands {dup,unshare)_fd() to take a max_fds argument that indicates the maximum number of file descriptors to copy from the old struct files. When the user requests that all file descriptors are supposed to be closed via close_range(min, max) then we can cap via unshare_fd(min) and hence don't need to do any of the heavy fput() work for everything above min. The patch makes it so that if CLOSE_RANGE_UNSHARE is requested and we do in fact currently share our file descriptor table we create a new private copy. We then close all fds in the requested range and finally after we're done we install the new fd table. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/file.c | 65 +++++++++++++++++++++++++++++++++++----- fs/open.c | 5 +--- include/linux/fdtable.h | 8 +++-- include/uapi/linux/close_range.h | 9 ++++++ kernel/fork.c | 11 +++---- 5 files changed, 79 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/close_range.h (limited to 'include/uapi') diff --git a/fs/file.c b/fs/file.c index 1b8ff05e8311..340bc9569f9d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -265,12 +266,22 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +{ + unsigned int count; + + count = count_open_files(fdt); + if (max_fds < NR_OPEN_DEFAULT) + max_fds = NR_OPEN_DEFAULT; + return min(count, max_fds); +} + /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ -struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; @@ -297,7 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. @@ -328,7 +339,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); @@ -665,32 +676,72 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ -int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd) +int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { unsigned int cur_max; + struct task_struct *me = current; + struct files_struct *cur_fds = me->files, *fds = NULL; + + if (flags & ~CLOSE_RANGE_UNSHARE) + return -EINVAL; if (fd > max_fd) return -EINVAL; rcu_read_lock(); - cur_max = files_fdtable(files)->max_fds; + cur_max = files_fdtable(cur_fds)->max_fds; rcu_read_unlock(); /* cap to last valid index into fdtable */ cur_max--; + if (flags & CLOSE_RANGE_UNSHARE) { + int ret; + unsigned int max_unshare_fds = NR_OPEN_MAX; + + /* + * If the requested range is greater than the current maximum, + * we're closing everything so only copy all file descriptors + * beneath the lowest file descriptor. + */ + if (max_fd >= cur_max) + max_unshare_fds = fd; + + ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); + if (ret) + return ret; + + /* + * We used to share our file descriptor table, and have now + * created a private one, make sure we're using it below. + */ + if (fds) + swap(cur_fds, fds); + } + max_fd = min(max_fd, cur_max); while (fd <= max_fd) { struct file *file; - file = pick_file(files, fd++); + file = pick_file(cur_fds, fd++); if (!file) continue; - filp_close(file, files); + filp_close(file, cur_fds); cond_resched(); } + if (fds) { + /* + * We're done closing the files we were supposed to. Time to install + * the new file descriptor table and drop the old one. + */ + task_lock(me); + me->files = cur_fds; + task_unlock(me); + put_files_struct(fds); + } + return 0; } diff --git a/fs/open.c b/fs/open.c index 073ea3c45347..5e62f18adc5b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1324,10 +1324,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { - if (flags) - return -EINVAL; - - return __close_range(current->files, fd, max_fd); + return __close_range(fd, max_fd, flags); } /* diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fcd07181a365..a32bf47c593e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,6 +22,7 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG +#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -109,7 +110,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; +struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -121,9 +122,10 @@ extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); -extern int __close_range(struct files_struct *files, unsigned int fd, - unsigned int max_fd); +extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h new file mode 100644 index 000000000000..6928a9fdee3c --- /dev/null +++ b/include/uapi/linux/close_range.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_CLOSE_RANGE_H +#define _UAPI_LINUX_CLOSE_RANGE_H + +/* Unshare the file descriptor table before closing file descriptors. */ +#define CLOSE_RANGE_UNSHARE (1U << 1) + +#endif /* _UAPI_LINUX_CLOSE_RANGE_H */ + diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..8948121c8454 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1474,7 +1474,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -2907,14 +2907,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2974,7 +2975,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3063,7 +3064,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; -- cgit v1.2.3-59-g8ed1b From 03cc8353c2244c8b790c3c81a0f1532d34a9d738 Mon Sep 17 00:00:00 2001 From: Rob Gill Date: Mon, 1 Jun 2020 21:17:56 +0000 Subject: USB: core: additional Device Classes to debug/usb/devices Several newer USB Device classes are not presently reported individually at /sys/kernel/debug/usb/devices, (reported as "unk."). This patch adds the following classes: 0fh (Personal Healthcare devices), 10h (USB Type-C combined Audio/Video devices) 11h (USB billboard), 12h (USB Type-C Bridge). As defined at [https://www.usb.org/defined-class-codes] Corresponding classes defined in include/linux/usb/ch9.h. Signed-off-by: Rob Gill Link: https://lore.kernel.org/r/20200601211749.6878-1-rrobgill@protonmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devices.c | 4 ++++ include/uapi/linux/usb/ch9.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 94b6fa6e585e..696b2b692b83 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -133,6 +133,10 @@ static const struct class_info clas_info[] = { {USB_CLASS_CSCID, "scard"}, {USB_CLASS_CONTENT_SEC, "c-sec"}, {USB_CLASS_VIDEO, "video"}, + {USB_CLASS_PERSONAL_HEALTHCARE, "perhc"}, + {USB_CLASS_AUDIO_VIDEO, "av"}, + {USB_CLASS_BILLBOARD, "blbrd"}, + {USB_CLASS_USB_TYPE_C_BRIDGE, "bridg"}, {USB_CLASS_WIRELESS_CONTROLLER, "wlcon"}, {USB_CLASS_MISC, "misc"}, {USB_CLASS_APP_SPEC, "app."}, diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 2b623f36af6b..456ab0c2b586 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -326,6 +326,10 @@ struct usb_device_descriptor { #define USB_CLASS_CONTENT_SEC 0x0d /* content security */ #define USB_CLASS_VIDEO 0x0e #define USB_CLASS_WIRELESS_CONTROLLER 0xe0 +#define USB_CLASS_PERSONAL_HEALTHCARE 0x0f +#define USB_CLASS_AUDIO_VIDEO 0x10 +#define USB_CLASS_BILLBOARD 0x11 +#define USB_CLASS_USB_TYPE_C_BRIDGE 0x12 #define USB_CLASS_MISC 0xef #define USB_CLASS_APP_SPEC 0xfe #define USB_CLASS_VENDOR_SPEC 0xff -- cgit v1.2.3-59-g8ed1b From 81c7462883b0cc0a4eeef0687f80ad5b5baee5f6 Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Thu, 18 Jun 2020 17:13:38 +0800 Subject: USB: replace hardcode maximum usb string length by definition Replace hardcoded maximum USB string length (126 bytes) by definition "USB_MAX_STRING_LEN". Signed-off-by: Macpaul Lin Acked-by: Alan Stern Link: https://lore.kernel.org/r/1592471618-29428-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 4 ++-- drivers/usb/gadget/configfs.c | 2 +- drivers/usb/gadget/usbstring.c | 4 ++-- include/uapi/linux/usb/ch9.h | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 5c1eb96a5c57..8fbf73467fef 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1085,7 +1085,7 @@ static void collect_langs(struct usb_gadget_strings **sp, __le16 *buf) while (*sp) { s = *sp; language = cpu_to_le16(s->language); - for (tmp = buf; *tmp && tmp < &buf[126]; tmp++) { + for (tmp = buf; *tmp && tmp < &buf[USB_MAX_STRING_LEN]; tmp++) { if (*tmp == language) goto repeat; } @@ -1160,7 +1160,7 @@ static int get_string(struct usb_composite_dev *cdev, collect_langs(sp, s->wData); } - for (len = 0; len <= 126 && s->wData[len]; len++) + for (len = 0; len <= USB_MAX_STRING_LEN && s->wData[len]; len++) continue; if (!len) return -EINVAL; diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 9dc06a4e1b30..56051bb97349 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -103,7 +103,7 @@ static int usb_string_copy(const char *s, char **s_copy) char *str; char *copy = *s_copy; ret = strlen(s); - if (ret > 126) + if (ret > USB_MAX_STRING_LEN) return -EOVERFLOW; str = kstrdup(s, GFP_KERNEL); diff --git a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c index 58a4d3325090..30f889ad3ad2 100644 --- a/drivers/usb/gadget/usbstring.c +++ b/drivers/usb/gadget/usbstring.c @@ -55,9 +55,9 @@ usb_gadget_get_string (const struct usb_gadget_strings *table, int id, u8 *buf) return -EINVAL; /* string descriptors have length, tag, then UTF16-LE text */ - len = min ((size_t) 126, strlen (s->s)); + len = min((size_t)USB_MAX_STRING_LEN, strlen(s->s)); len = utf8s_to_utf16s(s->s, len, UTF16_LITTLE_ENDIAN, - (wchar_t *) &buf[2], 126); + (wchar_t *) &buf[2], USB_MAX_STRING_LEN); if (len < 0) return -EINVAL; buf [0] = (len + 1) * 2; diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 456ab0c2b586..b1ed2ccfe9cf 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -368,6 +368,9 @@ struct usb_config_descriptor { /*-------------------------------------------------------------------------*/ +/* USB String descriptors can contain at most 126 characters. */ +#define USB_MAX_STRING_LEN 126 + /* USB_DT_STRING: String descriptor */ struct usb_string_descriptor { __u8 bLength; -- cgit v1.2.3-59-g8ed1b From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:55 +0800 Subject: io_uring: change the poll type to be 32-bits poll events should be 32-bits to cover EPOLLEXCLUSIVE. Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should check the feature bit first. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- include/uapi/linux/io_uring.h | 4 +++- tools/io_uring/liburing.h | 6 +++++- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b96179..0eb063daa9b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; io_get_req_task(req); @@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -8217,7 +8221,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a7..8d033961cb78 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -31,7 +31,8 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; - __u16 poll_events; + __u16 poll_events; /* compatibility */ + __u32 poll32_events; /* word-reversed for BE */ __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; @@ -248,6 +249,7 @@ struct io_uring_params { #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) +#define IORING_FEAT_POLL_32BITS (1U << 6) /* * io_uring_register(2) opcodes and arguments diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index 5f305c86b892..28a837b6069d 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -10,6 +10,7 @@ extern "C" { #include #include "../../include/uapi/linux/io_uring.h" #include +#include #include "barrier.h" /* @@ -145,11 +146,14 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - short poll_mask) + unsigned poll_mask) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; +#if __BYTE_ORDER == __BIG_ENDIAN + poll_mask = __swahw32(poll_mask); +#endif sqe->poll_events = poll_mask; } -- cgit v1.2.3-59-g8ed1b From 2a916ecc405686c1d86f632281bc06aa75ebae4e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 19 Jun 2020 03:32:48 +0000 Subject: net/devlink: Support querying hardware address of port function PCI PF and VF devlink port can manage the function represented by a devlink port. Enable users to query port function's hardware address. Example of a PCI VF port which supports a port function: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:11:22:33:44:66 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "enp6s0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "function": { "hw_addr": "00:11:22:33:44:66" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 12 ++++++++++++ include/uapi/linux/devlink.h | 10 ++++++++++ net/core/devlink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'include/uapi') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1df6dfec26c2..56fc9cdb189d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1107,6 +1107,18 @@ struct devlink_ops { int (*trap_policer_counter_get)(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 *p_drops); + /** + * @port_function_hw_addr_get: Port function's hardware address get function. + * + * Should be used by device drivers to report the hardware address of a function managed + * by the devlink port. Driver should return -EOPNOTSUPP if it doesn't support port + * function handling for a particular port. + * + * Note: @extack can be NULL when port notifier queries the port function. + */ + int (*port_function_hw_addr_get)(struct devlink *devlink, struct devlink_port *port, + u8 *hw_addr, int *hw_addr_len, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 08563e6a424d..07d0af8f5923 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -451,6 +451,8 @@ enum devlink_attr { DEVLINK_ATTR_TRAP_POLICER_RATE, /* u64 */ DEVLINK_ATTR_TRAP_POLICER_BURST, /* u64 */ + DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, @@ -497,4 +499,12 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_function_attr { + DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, + DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + + __DEVLINK_PORT_FUNCTION_ATTR_MAX, + DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 05197631d52a..b6848b607e9c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -563,6 +563,49 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } +static int +devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = port->devlink; + const struct devlink_ops *ops; + struct nlattr *function_attr; + bool empty_nest = true; + int err = 0; + + function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); + if (!function_attr) + return -EMSGSIZE; + + ops = devlink->ops; + if (ops->port_function_hw_addr_get) { + int uninitialized_var(hw_addr_len); + u8 hw_addr[MAX_ADDR_LEN]; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err == -EOPNOTSUPP) { + /* Port function attributes are optional for a port. If port doesn't + * support function attribute, returning -EOPNOTSUPP is not an error. + */ + err = 0; + goto out; + } else if (err) { + goto out; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + goto out; + empty_nest = false; + } + +out: + if (err || empty_nest) + nla_nest_cancel(msg, function_attr); + else + nla_nest_end(msg, function_attr); + return err; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, @@ -608,6 +651,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3-59-g8ed1b From b5872cd0e823e4cb50b3a75cd9522167eeb676a2 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sat, 20 Jun 2020 22:01:56 +0530 Subject: devlink: Add support for board.serial_number to info_get cb. Board serial number is a serial number, often available in PCI *Vital Product Data*. Also, update devlink-info.rst documentation file. Cc: Jiri Pirko Cc: Jakub Kicinski Signed-off-by: Vasundhara Volam Reviewed-by: Michael Chan Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-info.rst | 12 +++++------- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 8 ++++++++ 4 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 3fe11401b838..7572bf6de5c1 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -44,9 +44,11 @@ versions is generally discouraged - here, and via any other Linux API. reported for two ports of the same device or on two hosts of a multi-host device should be identical. - .. note:: ``devlink-info`` API should be extended with a new field - if devices want to report board/product serial number (often - reported in PCI *Vital Product Data* capability). + * - ``board.serial_number`` + - Board serial number of the device. + + This is usually the serial number of the board, often available in + PCI *Vital Product Data*. * - ``fixed`` - Group for hardware identifiers, and versions of components @@ -201,10 +203,6 @@ Future work The following extensions could be useful: - - product serial number - NIC boards often get labeled with a board serial - number rather than ASIC serial number; it'd be useful to add board serial - numbers to the API if they can be retrieved from the device; - - on-disk firmware file names - drivers list the file names of firmware they may need to load onto devices via the ``MODULE_FIRMWARE()`` macro. These, however, are per module, rather than per device. It'd be useful to list diff --git a/include/net/devlink.h b/include/net/devlink.h index 7007f93585a5..428f55f8197c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1284,6 +1284,8 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn); int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 07d0af8f5923..87c83a82991b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -453,6 +453,8 @@ enum devlink_attr { DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index baa45eca6b5a..455998a57671 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4502,6 +4502,14 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value) -- cgit v1.2.3-59-g8ed1b From 23a60f834406c8e3805328b630d09d5546b460c1 Mon Sep 17 00:00:00 2001 From: Collin Walling Date: Mon, 22 Jun 2020 11:46:36 -0400 Subject: s390/kvm: diagnose 0x318 sync and reset DIAGNOSE 0x318 (diag318) sets information regarding the environment the VM is running in (Linux, z/VM, etc) and is observed via firmware/service events. This is a privileged s390x instruction that must be intercepted by SIE. Userspace handles the instruction as well as migration. Data is communicated via VCPU register synchronization. The Control Program Name Code (CPNC) is stored in the SIE block. The CPNC along with the Control Program Version Code (CPVC) are stored in the kvm_vcpu_arch struct. This data is reset on load normal and clear resets. Signed-off-by: Collin Walling Reviewed-by: Janosch Frank Acked-by: Cornelia Huck Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20200622154636.5499-3-walling@linux.ibm.com [borntraeger@de.ibm.com: fix sync_reg position] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 4 +++- arch/s390/include/uapi/asm/kvm.h | 7 +++++-- arch/s390/kvm/kvm-s390.c | 11 ++++++++++- arch/s390/kvm/vsie.c | 1 + include/uapi/linux/kvm.h | 1 + 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index cee3cb6455a2..371ec6beb618 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -260,7 +260,8 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -745,6 +746,7 @@ struct kvm_vcpu_arch { bool gs_enabled; bool skey_enabled; struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..7a6b14874d65 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +266,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d47c19718615..08e6cf6cb454 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@ -3267,7 +3268,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) KVM_SYNC_ACRS | KVM_SYNC_CRS | KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; @@ -3562,6 +3564,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->pp = 0; vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->todpr = 0; + vcpu->arch.sie_block->cpnc = 0; } } @@ -3579,6 +3582,7 @@ static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) regs->etoken = 0; regs->etoken_extension = 0; + regs->diag318 = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -4196,6 +4200,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -4297,6 +4305,7 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 9e9056cebfcf..4f3cbf6003a9 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -548,6 +548,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..35cdb4307904 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_S390_DIAG318 184 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b From ac53503ee38a1ffbc47c7cca6cbfc48ba9c65c5e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 May 2020 18:01:43 +0200 Subject: media: videobuf2: add V4L2_FLAG_MEMORY_NON_CONSISTENT flag By setting or clearing V4L2_FLAG_MEMORY_NON_CONSISTENT flag user-space should be able to set or clear queue's NON_CONSISTENT ->dma_attrs. Queue's ->dma_attrs are passed to the underlying allocator in __vb2_buf_mem_alloc(), so thus user-space is able to request vb2 buffer's memory to be either consistent (coherent) or non-consistent. The patch set also adds a corresponding capability flag: fill_buf_caps() reports V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS when queue supports user-space cache management hints. Note, however, that MMAP_CACHE_HINTS capability only valid when the queue is used for memory MMAP-ed streaming I/O. Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/buffer.rst | 40 ++++++++++++++++++++-- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 10 ++++++ drivers/media/common/videobuf2/videobuf2-v4l2.c | 2 ++ include/uapi/linux/videodev2.h | 3 ++ 4 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/buffer.rst b/Documentation/userspace-api/media/v4l/buffer.rst index 951ae1ed485f..5088393b5a5c 100644 --- a/Documentation/userspace-api/media/v4l/buffer.rst +++ b/Documentation/userspace-api/media/v4l/buffer.rst @@ -577,7 +577,10 @@ Buffer Flags applications shall use this flag if the data captured in the buffer is not going to be touched by the CPU, instead the buffer will, probably, be passed on to a DMA-capable hardware unit for - further processing or output. + further processing or output. This flag is ignored unless the + queue is used for :ref:`memory mapping ` streaming I/O and + reports :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. * .. _`V4L2-BUF-FLAG-NO-CACHE-CLEAN`: - ``V4L2_BUF_FLAG_NO_CACHE_CLEAN`` @@ -585,7 +588,10 @@ Buffer Flags - Caches do not have to be cleaned for this buffer. Typically applications shall use this flag for output buffers if the data in this buffer has not been created by the CPU but by some - DMA-capable unit, in which case caches have not been used. + DMA-capable unit, in which case caches have not been used. This flag + is ignored unless the queue is used for :ref:`memory mapping ` + streaming I/O and reports :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. * .. _`V4L2-BUF-FLAG-M2M-HOLD-CAPTURE-BUF`: - ``V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF`` @@ -681,6 +687,36 @@ Buffer Flags \normalsize +.. _memory-flags: + +Memory Consistency Flags +======================== + +.. tabularcolumns:: |p{7.0cm}|p{2.2cm}|p{8.3cm}| + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + * .. _`V4L2-FLAG-MEMORY-NON-CONSISTENT`: + + - ``V4L2_FLAG_MEMORY_NON_CONSISTENT`` + - 0x00000001 + - A buffer is allocated either in consistent (it will be automatically + coherent between the CPU and the bus) or non-consistent memory. The + latter can provide performance gains, for instance the CPU cache + sync/flush operations can be avoided if the buffer is accessed by the + corresponding device only and the CPU does not read/write to/from that + buffer. However, this requires extra care from the driver -- it must + guarantee memory consistency by issuing a cache flush/sync when + consistency is needed. If this flag is set V4L2 will attempt to + allocate the buffer in non-consistent memory. The flag takes effect + only if the buffer is used for :ref:`memory mapping ` I/O and the + queue reports the :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. .. c:type:: v4l2_memory diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index b6d52083707b..96a59793d857 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -126,6 +126,7 @@ aborting or finishing any DMA in progress, an implicit .. _V4L2-BUF-CAP-SUPPORTS-REQUESTS: .. _V4L2-BUF-CAP-SUPPORTS-ORPHANED-BUFS: .. _V4L2-BUF-CAP-SUPPORTS-M2M-HOLD-CAPTURE-BUF: +.. _V4L2-BUF-CAP-SUPPORTS-MMAP-CACHE-HINTS: .. cssclass:: longtable @@ -156,6 +157,15 @@ aborting or finishing any DMA in progress, an implicit - Only valid for stateless decoders. If set, then userspace can set the ``V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF`` flag to hold off on returning the capture buffer until the OUTPUT timestamp changes. + * - ``V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS`` + - 0x00000040 + - This capability is set by the driver to indicate that the queue supports + cache and memory management hints. However, it's only valid when the + queue is used for :ref:`memory mapping ` streaming I/O. See + :ref:`V4L2_FLAG_MEMORY_NON_CONSISTENT `, + :ref:`V4L2_BUF_FLAG_NO_CACHE_INVALIDATE ` and + :ref:`V4L2_BUF_FLAG_NO_CACHE_CLEAN `. + Return Value ============ diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index f13851212cc8..e4b4354b42b8 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -710,6 +710,8 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) *caps |= V4L2_BUF_CAP_SUPPORTS_DMABUF; if (q->subsystem_flags & VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF) *caps |= V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF; + if (q->allow_cache_hints && q->io_modes & VB2_MMAP) + *caps |= V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS; #ifdef CONFIG_MEDIA_CONTROLLER_REQUEST_API if (q->supports_requests) *caps |= V4L2_BUF_CAP_SUPPORTS_REQUESTS; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c3a1cf1c507f..34ba1017b89b 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -189,6 +189,8 @@ enum v4l2_memory { V4L2_MEMORY_DMABUF = 4, }; +#define V4L2_FLAG_MEMORY_NON_CONSISTENT (1 << 0) + /* see also http://vektor.theorem.ca/graphics/ycbcr/ */ enum v4l2_colorspace { /* @@ -954,6 +956,7 @@ struct v4l2_requestbuffers { #define V4L2_BUF_CAP_SUPPORTS_REQUESTS (1 << 3) #define V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS (1 << 4) #define V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF (1 << 5) +#define V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS (1 << 6) /** * struct v4l2_plane - plane info for multi-planar buffers -- cgit v1.2.3-59-g8ed1b From 1e0b2318fa75d186ee0d2be31843ce867385fcc4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 May 2020 18:01:45 +0200 Subject: media: videobuf2: handle V4L2_FLAG_MEMORY_NON_CONSISTENT flag This patch lets user-space to request a non-consistent memory allocation during CREATE_BUFS and REQBUFS ioctl calls. = CREATE_BUFS struct v4l2_create_buffers has seven 4-byte reserved areas, so reserved[0] is renamed to ->flags. The struct, thus, now has six reserved 4-byte regions. = CREATE_BUFS32 struct v4l2_create_buffers32 has seven 4-byte reserved areas, so reserved[0] is renamed to ->flags. The struct, thus, now has six reserved 4-byte regions. = REQBUFS We use one bit of a ->reserved[1] member of struct v4l2_requestbuffers, which is now renamed to ->flags. Unlike v4l2_create_buffers, struct v4l2_requestbuffers does not have enough reserved room. Therefore for backward compatibility ->reserved and ->flags were put into anonymous union. Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-create-bufs.rst | 7 ++++++- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 11 ++++++++-- drivers/media/common/videobuf2/videobuf2-core.c | 6 ++++++ drivers/media/common/videobuf2/videobuf2-v4l2.c | 24 ++++++++++++++++++---- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 10 +++++++-- drivers/media/v4l2-core/v4l2-ioctl.c | 5 +---- include/uapi/linux/videodev2.h | 11 ++++++++-- 7 files changed, 59 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst index e1afc5b504c2..f2a702870fad 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst @@ -121,7 +121,12 @@ than the number requested. other changes, then set ``count`` to 0, ``memory`` to ``V4L2_MEMORY_MMAP`` and ``format.type`` to the buffer type. * - __u32 - - ``reserved``\ [7] + - ``flags`` + - Specifies additional buffer management attributes. + See :ref:`memory-flags`. + + * - __u32 + - ``reserved``\ [6] - A place holder for future extensions. Drivers and applications must set the array to zero. diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index 96a59793d857..75d894d9c36c 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -112,10 +112,17 @@ aborting or finishing any DMA in progress, an implicit ``V4L2_MEMORY_MMAP`` and ``type`` set to the buffer type. This will free any previously allocated buffers, so this is typically something that will be done at the start of the application. + * - union { + - (anonymous) + * - __u32 + - ``flags`` + - Specifies additional buffer management attributes. + See :ref:`memory-flags`. * - __u32 - ``reserved``\ [1] - - A place holder for future extensions. Drivers and applications - must set the array to zero. + - Kept for backwards compatibility. Use ``flags`` instead. + * - } + - .. tabularcolumns:: |p{6.1cm}|p{2.2cm}|p{8.7cm}| diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 0fdcf90330df..626c4db5134c 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -694,6 +694,9 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, unsigned int i; int ret; + if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) + consistent_mem = false; + if (q->streaming) { dprintk(1, "streaming active\n"); return -EBUSY; @@ -837,6 +840,9 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, bool consistent_mem = true; int ret; + if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) + consistent_mem = false; + if (q->num_buffers == VB2_MAX_FRAME) { dprintk(1, "maximum number of buffers already allocated\n"); return -ENOBUFS; diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 26a3ec333bb7..559a229cac41 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -718,12 +718,22 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) #endif } +static void clear_consistency_attr(struct vb2_queue *q, + int memory, + unsigned int *flags) +{ + if (!q->allow_cache_hints || memory != V4L2_MEMORY_MMAP) + *flags &= ~V4L2_FLAG_MEMORY_NON_CONSISTENT; +} + int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) { int ret = vb2_verify_memory_type(q, req->memory, req->type); fill_buf_caps(q, &req->capabilities); - return ret ? ret : vb2_core_reqbufs(q, req->memory, 0, &req->count); + clear_consistency_attr(q, req->memory, &req->flags); + return ret ? ret : vb2_core_reqbufs(q, req->memory, + req->flags, &req->count); } EXPORT_SYMBOL_GPL(vb2_reqbufs); @@ -755,6 +765,7 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) unsigned i; fill_buf_caps(q, &create->capabilities); + clear_consistency_attr(q, create->memory, &create->flags); create->index = q->num_buffers; if (create->count == 0) return ret != -EBUSY ? ret : 0; @@ -797,8 +808,11 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) for (i = 0; i < requested_planes; i++) if (requested_sizes[i] == 0) return -EINVAL; - return ret ? ret : vb2_core_create_bufs(q, create->memory, 0, - &create->count, requested_planes, requested_sizes); + return ret ? ret : vb2_core_create_bufs(q, create->memory, + create->flags, + &create->count, + requested_planes, + requested_sizes); } EXPORT_SYMBOL_GPL(vb2_create_bufs); @@ -969,11 +983,12 @@ int vb2_ioctl_reqbufs(struct file *file, void *priv, int res = vb2_verify_memory_type(vdev->queue, p->memory, p->type); fill_buf_caps(vdev->queue, &p->capabilities); + clear_consistency_attr(vdev->queue, p->memory, &p->flags); if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; - res = vb2_core_reqbufs(vdev->queue, p->memory, 0, &p->count); + res = vb2_core_reqbufs(vdev->queue, p->memory, p->flags, &p->count); /* If count == 0, then the owner has released all buffers and he is no longer owner of the queue. Otherwise we have a new owner. */ if (res == 0) @@ -991,6 +1006,7 @@ int vb2_ioctl_create_bufs(struct file *file, void *priv, p->index = vdev->queue->num_buffers; fill_buf_caps(vdev->queue, &p->capabilities); + clear_consistency_attr(vdev->queue, p->memory, &p->flags); /* * If count == 0, then just check if memory and type are valid. * Any -EBUSY result from vb2_verify_memory_type can be mapped to 0. diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index a99e82ec9ab6..593bcf6c3735 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -246,6 +246,9 @@ struct v4l2_format32 { * @memory: buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. + * @flags: additional buffer management attributes (ignored unless the + * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability and + * configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers32 { @@ -254,7 +257,8 @@ struct v4l2_create_buffers32 { __u32 memory; /* enum v4l2_memory */ struct v4l2_format32 format; __u32 capabilities; - __u32 reserved[7]; + __u32 flags; + __u32 reserved[6]; }; static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) @@ -355,7 +359,8 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64, { if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p64, p32, - offsetof(struct v4l2_create_buffers32, format))) + offsetof(struct v4l2_create_buffers32, format)) || + assign_in_user(&p64->flags, &p32->flags)) return -EFAULT; return __get_v4l2_format32(&p64->format, &p32->format, aux_buf, aux_space); @@ -417,6 +422,7 @@ static int put_v4l2_create32(struct v4l2_create_buffers __user *p64, copy_in_user(p32, p64, offsetof(struct v4l2_create_buffers32, format)) || assign_in_user(&p32->capabilities, &p64->capabilities) || + assign_in_user(&p32->flags, &p64->flags) || copy_in_user(p32->reserved, p64->reserved, sizeof(p64->reserved))) return -EFAULT; return __put_v4l2_format32(&p64->format, &p32->format); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 2322f08a98be..02bfef0da76d 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2038,9 +2038,6 @@ static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - - CLEAR_AFTER_FIELD(p, capabilities); - return ops->vidioc_reqbufs(file, fh, p); } @@ -2080,7 +2077,7 @@ static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - CLEAR_AFTER_FIELD(create, capabilities); + CLEAR_AFTER_FIELD(create, flags); v4l_sanitize_format(&create->format); diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 34ba1017b89b..fec2607a07e3 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -946,7 +946,10 @@ struct v4l2_requestbuffers { __u32 type; /* enum v4l2_buf_type */ __u32 memory; /* enum v4l2_memory */ __u32 capabilities; - __u32 reserved[1]; + union { + __u32 flags; + __u32 reserved[1]; + }; }; /* capabilities for struct v4l2_requestbuffers and v4l2_create_buffers */ @@ -2450,6 +2453,9 @@ struct v4l2_dbg_chip_info { * @memory: enum v4l2_memory; buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. + * @flags: additional buffer management attributes (ignored unless the + * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability + * and configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers { @@ -2458,7 +2464,8 @@ struct v4l2_create_buffers { __u32 memory; struct v4l2_format format; __u32 capabilities; - __u32 reserved[7]; + __u32 flags; + __u32 reserved[6]; }; /* -- cgit v1.2.3-59-g8ed1b From 286cf7d3a99e1ca8c1d8e674b9a98f2dbe8520dc Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 26 May 2020 10:59:53 +0200 Subject: media: videodev2.h: add V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL flag Add the V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL flag to signal that the coded frame interval can be set separately from the raw frame interval for stateful encoders. Signed-off-by: Hans Verkuil Reviewed-by: Michael Tretter Acked-by: Tomasz Figa Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-enum-fmt.rst | 30 ++++++++++++++++++---- .../userspace-api/media/videodev2.h.rst.exceptions | 1 + include/uapi/linux/videodev2.h | 1 + 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst index a53dd3d7f7e2..05835e04c20b 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst @@ -167,17 +167,37 @@ the ``mbus_code`` field is handled differently: - The hardware decoder for this compressed bytestream format (aka coded format) is capable of parsing a continuous bytestream. Applications do not need to parse the bytestream themselves to find the boundaries - between frames/fields. This flag can only be used in combination with - the ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to compressed + between frames/fields. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to compressed formats only. This flag is valid for stateful decoders only. * - ``V4L2_FMT_FLAG_DYN_RESOLUTION`` - 0x0008 - Dynamic resolution switching is supported by the device for this compressed bytestream format (aka coded format). It will notify the user via the event ``V4L2_EVENT_SOURCE_CHANGE`` when changes in the video - parameters are detected. This flag can only be used in combination - with the ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to - compressed formats only. It is also only applies to stateful codecs. + parameters are detected. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to + compressed formats only. This flag is valid for stateful codecs only. + * - ``V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL`` + - 0x0010 + - The hardware encoder supports setting the ``CAPTURE`` coded frame + interval separately from the ``OUTPUT`` raw frame interval. + Setting the ``OUTPUT`` raw frame interval with :ref:`VIDIOC_S_PARM ` + also sets the ``CAPTURE`` coded frame interval to the same value. + If this flag is set, then the ``CAPTURE`` coded frame interval can be + set to a different value afterwards. This is typically used for + offline encoding where the ``OUTPUT`` raw frame interval is used as + a hint for reserving hardware encoder resources and the ``CAPTURE`` coded + frame interval is the actual frame rate embedded in the encoded video + stream. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to + compressed formats only. This flag is valid for stateful encoders only. Return Value diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index a625fb90e3a9..ca05e4e126b2 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -187,6 +187,7 @@ replace define V4L2_FMT_FLAG_COMPRESSED fmtdesc-flags replace define V4L2_FMT_FLAG_EMULATED fmtdesc-flags replace define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM fmtdesc-flags replace define V4L2_FMT_FLAG_DYN_RESOLUTION fmtdesc-flags +replace define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL fmtdesc-flags # V4L2 timecode types replace define V4L2_TC_TYPE_24FPS timecode-type diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index fec2607a07e3..303805438814 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -794,6 +794,7 @@ struct v4l2_fmtdesc { #define V4L2_FMT_FLAG_EMULATED 0x0002 #define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM 0x0004 #define V4L2_FMT_FLAG_DYN_RESOLUTION 0x0008 +#define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL 0x0010 /* Frame Size and frame rate enumeration */ /* -- cgit v1.2.3-59-g8ed1b From 79a28ddd18e9c653f13f60dfabee15c024e64b9b Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Tue, 23 Jun 2020 10:33:45 +0200 Subject: rtnetlink: add keepalived rtm_protocol Keepalived can set global static ip routes or virtual ip routes dynamically following VRRP protocol states. Using a dedicated rtm_protocol will help keeping track of it. Changes in v2: - fix tab/space indenting Signed-off-by: Alexandre Cassen Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 073e71ef6bdd..879e64950a0a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -257,12 +257,12 @@ enum { /* rtm_protocol */ -#define RTPROT_UNSPEC 0 -#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; - not used by current IPv4 */ -#define RTPROT_KERNEL 2 /* Route installed by kernel */ -#define RTPROT_BOOT 3 /* Route installed during boot */ -#define RTPROT_STATIC 4 /* Route installed by administrator */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; they are just passed from user and back as is. @@ -271,22 +271,23 @@ enum { avoid conflicts. */ -#define RTPROT_GATED 8 /* Apparently, GateD */ -#define RTPROT_RA 9 /* RDISC/ND router advertisements */ -#define RTPROT_MRT 10 /* Merit MRT */ -#define RTPROT_ZEBRA 11 /* Zebra */ -#define RTPROT_BIRD 12 /* BIRD */ -#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ -#define RTPROT_XORP 14 /* XORP */ -#define RTPROT_NTK 15 /* Netsukuku */ -#define RTPROT_DHCP 16 /* DHCP client */ -#define RTPROT_MROUTED 17 /* Multicast daemon */ -#define RTPROT_BABEL 42 /* Babel daemon */ -#define RTPROT_BGP 186 /* BGP Routes */ -#define RTPROT_ISIS 187 /* ISIS Routes */ -#define RTPROT_OSPF 188 /* OSPF Routes */ -#define RTPROT_RIP 189 /* RIP Routes */ -#define RTPROT_EIGRP 192 /* EIGRP Routes */ +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ /* rtm_scope -- cgit v1.2.3-59-g8ed1b From bdb7b79b4ce864a724250e1d35948c46f135de36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 22 Jun 2020 20:22:21 -0700 Subject: bpf: Switch most helper return values from 32-bit int to 64-bit long Switch most of BPF helper definitions from returning int to long. These definitions are coming from comments in BPF UAPI header and are used to generate bpf_helper_defs.h (under libbpf) to be later included and used from BPF programs. In actual in-kernel implementation, all the helpers are defined as returning u64, but due to some historical reasons, most of them are actually defined as returning int in UAPI (usually, to return 0 on success, and negative value on error). This actually causes Clang to quite often generate sub-optimal code, because compiler believes that return value is 32-bit, and in a lot of cases has to be up-converted (usually with a pair of 32-bit bit shifts) to 64-bit values, before they can be used further in BPF code. Besides just "polluting" the code, these 32-bit shifts quite often cause problems for cases in which return value matters. This is especially the case for the family of bpf_probe_read_str() functions. There are few other similar helpers (e.g., bpf_read_branch_records()), in which return value is used by BPF program logic to record variable-length data and process it. For such cases, BPF program logic carefully manages offsets within some array or map to read variable-length data. For such uses, it's crucial for BPF verifier to track possible range of register values to prove that all the accesses happen within given memory bounds. Those extraneous zero-extending bit shifts, inserted by Clang (and quite often interleaved with other code, which makes the issues even more challenging and sometimes requires employing extra per-variable compiler barriers), throws off verifier logic and makes it mark registers as having unknown variable offset. We'll study this pattern a bit later below. Another common pattern is to check return of BPF helper for non-zero state to detect error conditions and attempt alternative actions in such case. Even in this simple and straightforward case, this 32-bit vs BPF's native 64-bit mode quite often leads to sub-optimal and unnecessary extra code. We'll look at this pattern as well. Clang's BPF target supports two modes of code generation: ALU32, in which it is capable of using lower 32-bit parts of registers, and no-ALU32, in which only full 64-bit registers are being used. ALU32 mode somewhat mitigates the above described problems, but not in all cases. This patch switches all the cases in which BPF helpers return 0 or negative error from returning int to returning long. It is shown below that such change in definition leads to equivalent or better code. No-ALU32 mode benefits more, but ALU32 mode doesn't degrade or still gets improved code generation. Another class of cases switched from int to long are bpf_probe_read_str()-like helpers, which encode successful case as non-negative values, while still returning negative value for errors. In all of such cases, correctness is preserved due to two's complement encoding of negative values and the fact that all helpers return values with 32-bit absolute value. Two's complement ensures that for negative values higher 32 bits are all ones and when truncated, leave valid negative 32-bit value with the same value. Non-negative values have upper 32 bits set to zero and similarly preserve value when high 32 bits are truncated. This means that just casting to int/u32 is correct and efficient (and in ALU32 mode doesn't require any extra shifts). To minimize the chances of regressions, two code patterns were investigated, as mentioned above. For both patterns, BPF assembly was analyzed in ALU32/NO-ALU32 compiler modes, both with current 32-bit int return type and new 64-bit long return type. Case 1. Variable-length data reading and concatenation. This is quite ubiquitous pattern in tracing/monitoring applications, reading data like process's environment variables, file path, etc. In such case, many pieces of string-like variable-length data are read into a single big buffer, and at the end of the process, only a part of array containing actual data is sent to user-space for further processing. This case is tested in test_varlen.c selftest (in the next patch). Code flow is roughly as follows: void *payload = &sample->payload; u64 len; len = bpf_probe_read_kernel_str(payload, MAX_SZ1, &source_data1); if (len <= MAX_SZ1) { payload += len; sample->len1 = len; } len = bpf_probe_read_kernel_str(payload, MAX_SZ2, &source_data2); if (len <= MAX_SZ2) { payload += len; sample->len2 = len; } /* and so on */ sample->total_len = payload - &sample->payload; /* send over, e.g., perf buffer */ There could be two variations with slightly different code generated: when len is 64-bit integer and when it is 32-bit integer. Both variations were analysed. BPF assembly instructions between two successive invocations of bpf_probe_read_kernel_str() were used to check code regressions. Results are below, followed by short analysis. Left side is using helpers with int return type, the right one is after the switch to long. ALU32 + INT ALU32 + LONG =========== ============ 64-BIT (13 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +9 18: if r0 > 256 goto +6 19: w1 = w0 19: r1 = 0 ll 20: r1 <<= 32 21: *(u64 *)(r1 + 0) = r0 21: r1 s>>= 32 22: r6 = 0 ll 22: r2 = 0 ll 24: r6 += r0 24: *(u64 *)(r2 + 0) = r1 00000000000000c8 : 25: r6 = 0 ll 25: r1 = r6 27: r6 += r1 26: w2 = 256 00000000000000e0 : 27: r3 = 0 ll 28: r1 = r6 29: call 115 29: w2 = 256 30: r3 = 0 ll 32: call 115 32-BIT (11 insns): 32-BIT (12 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +7 18: if w0 > 256 goto +8 19: r1 = 0 ll 19: r1 = 0 ll 21: *(u32 *)(r1 + 0) = r0 21: *(u32 *)(r1 + 0) = r0 22: w1 = w0 22: r0 <<= 32 23: r6 = 0 ll 23: r0 >>= 32 25: r6 += r1 24: r6 = 0 ll 00000000000000d0 : 26: r6 += r0 26: r1 = r6 00000000000000d8 : 27: w2 = 256 27: r1 = r6 28: r3 = 0 ll 28: w2 = 256 30: call 115 29: r3 = 0 ll 31: call 115 In ALU32 mode, the variant using 64-bit length variable clearly wins and avoids unnecessary zero-extension bit shifts. In practice, this is even more important and good, because BPF code won't need to do extra checks to "prove" that payload/len are within good bounds. 32-bit len is one instruction longer. Clang decided to do 64-to-32 casting with two bit shifts, instead of equivalent `w1 = w0` assignment. The former uses extra register. The latter might potentially lose some range information, but not for 32-bit value. So in this case, verifier infers that r0 is [0, 256] after check at 18:, and shifting 32 bits left/right keeps that range intact. We should probably look into Clang's logic and see why it chooses bitshifts over sub-register assignments for this. NO-ALU32 + INT NO-ALU32 + LONG ============== =============== 64-BIT (14 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r0 <<= 32 18: if r0 > 256 goto +6 19: r1 = r0 19: r1 = 0 ll 20: r1 >>= 32 21: *(u64 *)(r1 + 0) = r0 21: if r1 > 256 goto +7 22: r6 = 0 ll 22: r0 s>>= 32 24: r6 += r0 23: r1 = 0 ll 00000000000000c8 : 25: *(u64 *)(r1 + 0) = r0 25: r1 = r6 26: r6 = 0 ll 26: r2 = 256 28: r6 += r0 27: r3 = 0 ll 00000000000000e8 : 29: call 115 29: r1 = r6 30: r2 = 256 31: r3 = 0 ll 33: call 115 32-BIT (13 insns): 32-BIT (13 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r1 = r0 18: r1 = r0 19: r1 <<= 32 19: r1 <<= 32 20: r1 >>= 32 20: r1 >>= 32 21: if r1 > 256 goto +6 21: if r1 > 256 goto +6 22: r2 = 0 ll 22: r2 = 0 ll 24: *(u32 *)(r2 + 0) = r0 24: *(u32 *)(r2 + 0) = r0 25: r6 = 0 ll 25: r6 = 0 ll 27: r6 += r1 27: r6 += r1 00000000000000e0 : 00000000000000e0 : 28: r1 = r6 28: r1 = r6 29: r2 = 256 29: r2 = 256 30: r3 = 0 ll 30: r3 = 0 ll 32: call 115 32: call 115 In NO-ALU32 mode, for the case of 64-bit len variable, Clang generates much superior code, as expected, eliminating unnecessary bit shifts. For 32-bit len, code is identical. So overall, only ALU-32 32-bit len case is more-or-less equivalent and the difference stems from internal Clang decision, rather than compiler lacking enough information about types. Case 2. Let's look at the simpler case of checking return result of BPF helper for errors. The code is very simple: long bla; if (bpf_probe_read_kenerl(&bla, sizeof(bla), 0)) return 1; else return 0; ALU32 + CHECK (9 insns) ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: w2 = 8 2: w2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: w1 = w0 5: r1 = r0 6: w0 = 1 6: w0 = 1 7: if w1 != 0 goto +1 7: if r1 != 0 goto +1 8: w0 = 0 8: w0 = 0 0000000000000048 : 0000000000000048 : 9: exit 9: exit Almost identical code, the only difference is the use of full register assignment (r1 = r0) vs half-registers (w1 = w0) in instruction #5. On 32-bit architectures, new BPF assembly might be slightly less optimal, in theory. But one can argue that's not a big issue, given that use of full registers is still prevalent (e.g., for parameter passing). NO-ALU32 + CHECK (11 insns) NO-ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: r2 = 8 2: r2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: r1 = r0 5: r1 = r0 6: r1 <<= 32 6: r0 = 1 7: r1 >>= 32 7: if r1 != 0 goto +1 8: r0 = 1 8: r0 = 0 9: if r1 != 0 goto +1 0000000000000048 : 10: r0 = 0 9: exit 0000000000000058 : 11: exit NO-ALU32 is a clear improvement, getting rid of unnecessary zero-extension bit shifts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200623032224.4020118-1-andriin@fb.com --- include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- tools/include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- 2 files changed, 192 insertions(+), 192 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform -- cgit v1.2.3-59-g8ed1b From 428d2459cceb77357b81c242ca22462a6a904817 Mon Sep 17 00:00:00 2001 From: Petr Vaněk Date: Sat, 30 May 2020 14:39:12 +0200 Subject: xfrm: introduce oseq-may-wrap flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 4303 in section 3.3.3 suggests to disable anti-replay for manually distributed ICVs in which case the sender does not need to monitor or reset the counter. However, the sender still increments the counter and when it reaches the maximum value, the counter rolls over back to zero. This patch introduces new extra_flag XFRM_SA_XFLAG_OSEQ_MAY_WRAP which allows sequence number to cycle in outbound packets if set. This flag is used only in legacy and bmp code, because esn should not be negotiated if anti-replay is disabled (see note in 3.3.3 section). Signed-off-by: Petr Vaněk Acked-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + net/xfrm/xfrm_replay.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index ff7cfdc6cb44..ffc6a5391bb7 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -387,6 +387,7 @@ struct xfrm_usersa_info { }; #define XFRM_SA_XFLAG_DONT_ENCAP_DSCP 1 +#define XFRM_SA_XFLAG_OSEQ_MAY_WRAP 2 struct xfrm_usersa_id { xfrm_address_t daddr; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 98943f8d01aa..c6a4338a0d08 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -89,7 +89,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(x->replay.oseq == 0)) { + if (unlikely(x->replay.oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -168,7 +169,8 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(replay_esn->oseq == 0)) { + if (unlikely(replay_esn->oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -572,7 +574,8 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < x->replay.oseq)) { + if (unlikely(oseq < x->replay.oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -611,7 +614,8 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(oseq < replay_esn->oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; -- cgit v1.2.3-59-g8ed1b From 65959522f8060659e308977f09f3eb7b7af5e43f Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 23 Jun 2020 14:30:40 +0300 Subject: RDMA: Add support to dump resource tracker in RAW format Add support to get resource dump in raw format. It enable drivers to return the entire device specific QP/CQ/MR context without a need from the driver to set each field separately. The raw query returns only the device specific data, general data is still returned by using the existing queries. Example: $ rdma res show mr dev mlx5_1 mrn 2 -r -j [{"ifindex":7,"ifname":"mlx5_1", "data":[0,4,255,254,0,0,0,0,0,0,0,0,16,28,0,216,...]}] Link: https://lore.kernel.org/r/20200623113043.1228482-9-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 + drivers/infiniband/core/nldev.c | 180 +++++++++++++++++++++++++-------------- include/rdma/ib_verbs.h | 3 + include/uapi/rdma/rdma_netlink.h | 8 ++ 4 files changed, 132 insertions(+), 62 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index cbe95e729cf1..1335ed1f1e4a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2619,8 +2619,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 394e307c342c..1051b5622b62 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -114,6 +114,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, @@ -446,11 +447,11 @@ static int fill_res_name_pid(struct sk_buff *msg, return err ? -EMSGSIZE : 0; } -static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, - struct rdma_restrack_entry *res, uint32_t port) +static int fill_res_qp_entry_query(struct sk_buff *msg, + struct rdma_restrack_entry *res, + struct ib_device *dev, + struct ib_qp *qp) { - struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; @@ -459,16 +460,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (ret) return ret; - if (port && port != qp_attr.port_num) - return -EAGAIN; - - /* In create_qp() port is not set yet */ - if (qp_attr.port_num && - nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) - goto err; - - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) - goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) @@ -492,13 +483,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; - if (!rdma_is_kernel_res(res) && - nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) - goto err; - - if (fill_res_name_pid(msg, res)) - goto err; - if (dev->ops.fill_res_qp_entry) return dev->ops.fill_res_qp_entry(msg, qp); return 0; @@ -506,6 +490,48 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, err: return -EMSGSIZE; } +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + int ret; + + if (port && port != qp->port) + return -EAGAIN; + + /* In create_qp() port is not set yet */ + if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) + return -EINVAL; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); + if (ret) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + return -EMSGSIZE; + + ret = fill_res_name_pid(msg, res); + if (ret) + return -EMSGSIZE; + + return fill_res_qp_entry_query(msg, res, dev, qp); +} + +static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + + if (port && port != qp->port) + return -EAGAIN; + if (!dev->ops.fill_res_qp_entry_raw) + return -EINVAL; + return dev->ops.fill_res_qp_entry_raw(msg, qp); +} + static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { @@ -565,34 +591,42 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) - goto err; + return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) - goto err; + return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (dev->ops.fill_res_cq_entry) - return dev->ops.fill_res_cq_entry(msg, cq); - return 0; + return (dev->ops.fill_res_cq_entry) ? + dev->ops.fill_res_cq_entry(msg, cq) : 0; +} -err: return -EMSGSIZE; +static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; + + if (!dev->ops.fill_res_cq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -603,30 +637,39 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) - goto err; + return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (dev->ops.fill_res_mr_entry) - return dev->ops.fill_res_mr_entry(msg, mr); - return 0; + return (dev->ops.fill_res_mr_entry) ? + dev->ops.fill_res_mr_entry(msg, mr) : + 0; +} -err: return -EMSGSIZE; +static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (!dev->ops.fill_res_mr_entry_raw) + return -EINVAL; + return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -1149,7 +1192,6 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; - enum rdma_nldev_command nldev_cmd; u8 flags; u32 entry; u32 id; @@ -1161,40 +1203,34 @@ enum nldev_res_flags { static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { - .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, @@ -1253,7 +1289,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (fill_nldev_handle(msg, device)) { @@ -1331,7 +1368,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (fill_nldev_handle(skb, device)) { @@ -1413,26 +1451,29 @@ err_index: return ret; } -#define RES_GET_FUNCS(name, type) \ - static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ - { \ - return res_get_common_dumpit(skb, cb, type, \ - fill_res_##name##_entry); \ - } \ - static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ - struct nlmsghdr *nlh, \ + { \ + return res_get_common_dumpit(skb, cb, type, \ + fill_res_##name##_entry); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ - { \ - return res_get_common_doit(skb, nlh, extack, type, \ - fill_res_##name##_entry); \ + { \ + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); static LIST_HEAD(link_ops); @@ -2117,6 +2158,21 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { + .doit = nldev_res_get_qp_raw_doit, + .dump = nldev_res_get_qp_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { + .doit = nldev_res_get_cq_raw_doit, + .dump = nldev_res_get_cq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { + .doit = nldev_res_get_mr_raw_doit, + .dump = nldev_res_get_mr_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9127cffafccd..77106ff3cd26 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2583,8 +2583,11 @@ struct ib_device_ops { * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); + int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); + int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); + int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); /* Device lifecycle callbacks */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 8e277783fa96..3826143d420d 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -287,6 +287,12 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_DEL, + RDMA_NLDEV_CMD_RES_QP_GET_RAW, + + RDMA_NLDEV_CMD_RES_CQ_GET_RAW, + + RDMA_NLDEV_CMD_RES_MR_GET_RAW, + RDMA_NLDEV_NUM_OPS }; @@ -525,6 +531,8 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_DEV_DIM, /* u8 */ + RDMA_NLDEV_ATTR_RES_RAW, /* binary */ + /* * Always the end */ -- cgit v1.2.3-59-g8ed1b From 62fb45d317c5fa08e4db093441835bb6f33acbd7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 18 Jun 2020 16:42:06 +0200 Subject: USB: ch9: add "USB_" prefix in front of TEST defines For some reason, the TEST_ defines in the usb/ch9.h files did not have the USB_ prefix on it, making it a bit confusing when reading the file, as well as not the nicest thing to do in a uapi file. So fix that up and add the USB_ prefix on to them, and fix up all in-kernel usages. This included deleting the duplicate copy in the net2272.h file. Cc: Felipe Balbi Cc: Michal Simek Cc: Mathias Nyman Cc: Pawel Laszczak Cc: YueHaibing Cc: Nathan Chancellor Cc: Jason Yan Cc: Jia-Ju Bai Cc: Stephen Boyd Cc: Christophe JAILLET Cc: Arnd Bergmann Cc: Jules Irenge Cc: Alan Stern Cc: Thinh Nguyen Cc: Rob Gill Cc: Macpaul Lin Acked-by: Minas Harutyunyan Acked-by: Bin Liu Acked-by: Chunfeng Yun Acked-by: Peter Chen Link: https://lore.kernel.org/r/20200618144206.2655890-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/ep0.c | 8 ++++---- drivers/usb/chipidea/udc.c | 10 +++++----- drivers/usb/common/debug.c | 10 +++++----- drivers/usb/dwc2/debugfs.c | 20 ++++++++++---------- drivers/usb/dwc2/gadget.c | 10 +++++----- drivers/usb/dwc3/debugfs.c | 20 ++++++++++---------- drivers/usb/dwc3/ep0.c | 10 +++++----- drivers/usb/dwc3/gadget.c | 10 +++++----- drivers/usb/gadget/udc/bdc/bdc_ep.c | 10 +++++----- drivers/usb/gadget/udc/gr_udc.c | 4 ++-- drivers/usb/gadget/udc/mv_udc_core.c | 2 +- drivers/usb/gadget/udc/net2272.c | 2 +- drivers/usb/gadget/udc/net2272.h | 5 ----- drivers/usb/gadget/udc/udc-xilinx.c | 4 ++-- drivers/usb/host/xhci-hub.c | 7 ++++--- drivers/usb/misc/ehset.c | 8 ++++---- drivers/usb/mtu3/mtu3_gadget_ep0.c | 16 ++++++++-------- drivers/usb/musb/musb_gadget_ep0.c | 20 ++++++++------------ drivers/usb/musb/musb_virthub.c | 20 ++++++++++---------- include/uapi/linux/usb/ch9.h | 10 +++++----- 20 files changed, 99 insertions(+), 107 deletions(-) (limited to 'include/uapi') diff --git a/drivers/usb/cdns3/ep0.c b/drivers/usb/cdns3/ep0.c index 82645a2a0f52..04a522f5ae58 100644 --- a/drivers/usb/cdns3/ep0.c +++ b/drivers/usb/cdns3/ep0.c @@ -328,10 +328,10 @@ static int cdns3_ep0_feature_handle_device(struct cdns3_device *priv_dev, return -EINVAL; switch (tmode >> 8) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: cdns3_set_register_bit(&priv_dev->regs->usb_cmd, USB_CMD_STMODE | USB_STS_TMODE_SEL(tmode - 1)); diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index db0cfde0cc3c..4beb25888917 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1215,11 +1215,11 @@ __acquires(ci->lock) case USB_DEVICE_TEST_MODE: tmode = le16_to_cpu(req.wIndex) >> 8; switch (tmode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: ci->test_mode = tmode; err = isr_setup_status_phase( ci); diff --git a/drivers/usb/common/debug.c b/drivers/usb/common/debug.c index 92a986aeaa5d..410acd670ca7 100644 --- a/drivers/usb/common/debug.c +++ b/drivers/usb/common/debug.c @@ -53,15 +53,15 @@ static const char *usb_decode_device_feature(u16 wValue) static const char *usb_decode_test_mode(u16 wIndex) { switch (wIndex) { - case TEST_J: + case USB_TEST_J: return ": TEST_J"; - case TEST_K: + case USB_TEST_K: return ": TEST_K"; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: return ": TEST_SE0_NAK"; - case TEST_PACKET: + case USB_TEST_PACKET: return ": TEST_PACKET"; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: return ": TEST_FORCE_EN"; default: return ": UNKNOWN"; diff --git a/drivers/usb/dwc2/debugfs.c b/drivers/usb/dwc2/debugfs.c index 3a0dcbfbc827..aaafd463d72a 100644 --- a/drivers/usb/dwc2/debugfs.c +++ b/drivers/usb/dwc2/debugfs.c @@ -37,15 +37,15 @@ static ssize_t testmode_write(struct file *file, const char __user *ubuf, size_t return -EFAULT; if (!strncmp(buf, "test_j", 6)) - testmode = TEST_J; + testmode = USB_TEST_J; else if (!strncmp(buf, "test_k", 6)) - testmode = TEST_K; + testmode = USB_TEST_K; else if (!strncmp(buf, "test_se0_nak", 12)) - testmode = TEST_SE0_NAK; + testmode = USB_TEST_SE0_NAK; else if (!strncmp(buf, "test_packet", 11)) - testmode = TEST_PACKET; + testmode = USB_TEST_PACKET; else if (!strncmp(buf, "test_force_enable", 17)) - testmode = TEST_FORCE_EN; + testmode = USB_TEST_FORCE_ENABLE; else testmode = 0; @@ -78,19 +78,19 @@ static int testmode_show(struct seq_file *s, void *unused) case 0: seq_puts(s, "no test\n"); break; - case TEST_J: + case USB_TEST_J: seq_puts(s, "test_j\n"); break; - case TEST_K: + case USB_TEST_K: seq_puts(s, "test_k\n"); break; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: seq_puts(s, "test_se0_nak\n"); break; - case TEST_PACKET: + case USB_TEST_PACKET: seq_puts(s, "test_packet\n"); break; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: seq_puts(s, "test_force_enable\n"); break; default: diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 12b98b466287..38fc46b0c026 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1561,11 +1561,11 @@ int dwc2_hsotg_set_test_mode(struct dwc2_hsotg *hsotg, int testmode) dctl &= ~DCTL_TSTCTL_MASK; switch (testmode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: dctl |= testmode << DCTL_TSTCTL_SHIFT; break; default: diff --git a/drivers/usb/dwc3/debugfs.c b/drivers/usb/dwc3/debugfs.c index 6d9de334e46a..14dc6a37305d 100644 --- a/drivers/usb/dwc3/debugfs.c +++ b/drivers/usb/dwc3/debugfs.c @@ -466,19 +466,19 @@ static int dwc3_testmode_show(struct seq_file *s, void *unused) case 0: seq_printf(s, "no test\n"); break; - case TEST_J: + case USB_TEST_J: seq_printf(s, "test_j\n"); break; - case TEST_K: + case USB_TEST_K: seq_printf(s, "test_k\n"); break; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: seq_printf(s, "test_se0_nak\n"); break; - case TEST_PACKET: + case USB_TEST_PACKET: seq_printf(s, "test_packet\n"); break; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: seq_printf(s, "test_force_enable\n"); break; default: @@ -506,15 +506,15 @@ static ssize_t dwc3_testmode_write(struct file *file, return -EFAULT; if (!strncmp(buf, "test_j", 6)) - testmode = TEST_J; + testmode = USB_TEST_J; else if (!strncmp(buf, "test_k", 6)) - testmode = TEST_K; + testmode = USB_TEST_K; else if (!strncmp(buf, "test_se0_nak", 12)) - testmode = TEST_SE0_NAK; + testmode = USB_TEST_SE0_NAK; else if (!strncmp(buf, "test_packet", 11)) - testmode = TEST_PACKET; + testmode = USB_TEST_PACKET; else if (!strncmp(buf, "test_force_enable", 17)) - testmode = TEST_FORCE_EN; + testmode = USB_TEST_FORCE_ENABLE; else testmode = 0; diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 6dee4dabc0a4..8dd69728add3 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -425,11 +425,11 @@ static int dwc3_ep0_handle_test(struct dwc3 *dwc, enum usb_device_state state, return -EINVAL; switch (wIndex >> 8) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: dwc->test_mode_nr = wIndex >> 8; dwc->test_mode = true; break; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 80c3ef134e41..0b59b2f1cf26 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -46,11 +46,11 @@ int dwc3_gadget_set_test_mode(struct dwc3 *dwc, int mode) reg &= ~DWC3_DCTL_TSTCTRL_MASK; switch (mode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: reg |= mode << 1; break; default: diff --git a/drivers/usb/gadget/udc/bdc/bdc_ep.c b/drivers/usb/gadget/udc/bdc/bdc_ep.c index d49c6dc1082d..ba250cf75bef 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_ep.c +++ b/drivers/usb/gadget/udc/bdc/bdc_ep.c @@ -927,11 +927,11 @@ static int bdc_set_test_mode(struct bdc *bdc) usb2_pm &= ~BDC_PTC_MASK; dev_dbg(bdc->dev, "%s\n", __func__); switch (bdc->test_mode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: usb2_pm |= bdc->test_mode << 28; break; default: diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 7164ad9800f1..345e28d76709 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -912,9 +912,9 @@ static int gr_device_request(struct gr_udc *dev, u8 type, u8 request, return gr_ep0_respond_empty(dev); case USB_DEVICE_TEST_MODE: - /* The hardware does not support TEST_FORCE_EN */ + /* The hardware does not support USB_TEST_FORCE_ENABLE */ test = index >> 8; - if (test >= TEST_J && test <= TEST_PACKET) { + if (test >= USB_TEST_J && test <= USB_TEST_PACKET) { dev->test_mode = test; return gr_ep0_respond(dev, NULL, 0, gr_ep0_testmode_complete); diff --git a/drivers/usb/gadget/udc/mv_udc_core.c b/drivers/usb/gadget/udc/mv_udc_core.c index cafde053788b..69289717d856 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -1502,7 +1502,7 @@ out: static void mv_udc_testmode(struct mv_udc *udc, u16 index) { - if (index <= TEST_FORCE_EN) { + if (index <= USB_TEST_FORCE_ENABLE) { udc->test_mode = index; if (udc_prime_status(udc, EP_DIR_IN, 0, true)) ep0_stall(udc); diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 928057b206f1..fbbe62513545 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -1688,7 +1688,7 @@ net2272_set_test_mode(struct net2272 *dev, int mode) net2272_write(dev, USBTEST, mode); /* load test packet */ - if (mode == TEST_PACKET) { + if (mode == USB_TEST_PACKET) { /* switch to 8 bit mode */ net2272_write(dev, LOCCTL, net2272_read(dev, LOCCTL) & ~(1 << DATA_WIDTH)); diff --git a/drivers/usb/gadget/udc/net2272.h b/drivers/usb/gadget/udc/net2272.h index 8e644627992d..87d0ab9ffeeb 100644 --- a/drivers/usb/gadget/udc/net2272.h +++ b/drivers/usb/gadget/udc/net2272.h @@ -105,11 +105,6 @@ #define USBTEST 0x32 #define TEST_MODE_SELECT 0 #define NORMAL_OPERATION 0 -#define TEST_J 1 -#define TEST_K 2 -#define TEST_SE0_NAK 3 -#define TEST_PACKET 4 -#define TEST_FORCE_ENABLE 5 #define XCVRDIAG 0x33 #define FORCE_FULL_SPEED 2 #define FORCE_HIGH_SPEED 3 diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 709553bdb233..d5e9d20c097d 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -2097,9 +2097,9 @@ static int xudc_probe(struct platform_device *pdev) /* Check for IP endianness */ udc->write_fn = xudc_write32_be; udc->read_fn = xudc_read32_be; - udc->write_fn(udc->addr, XUSB_TESTMODE_OFFSET, TEST_J); + udc->write_fn(udc->addr, XUSB_TESTMODE_OFFSET, USB_TEST_J); if ((udc->read_fn(udc->addr + XUSB_TESTMODE_OFFSET)) - != TEST_J) { + != USB_TEST_J) { udc->write_fn = xudc_write32; udc->read_fn = xudc_read32; } diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index f37316d2c8fa..073c54e42223 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -612,7 +612,7 @@ static void xhci_port_set_test_mode(struct xhci_hcd *xhci, temp |= test_mode << PORT_TEST_MODE_SHIFT; writel(temp, port->addr + PORTPMSC); xhci->test_mode = test_mode; - if (test_mode == TEST_FORCE_EN) + if (test_mode == USB_TEST_FORCE_ENABLE) xhci_start(xhci); } @@ -666,7 +666,7 @@ static int xhci_exit_test_mode(struct xhci_hcd *xhci) xhci_err(xhci, "Not in test mode, do nothing.\n"); return 0; } - if (xhci->test_mode == TEST_FORCE_EN && + if (xhci->test_mode == USB_TEST_FORCE_ENABLE && !(xhci->xhc_state & XHCI_STATE_HALTED)) { retval = xhci_halt(xhci); if (retval) @@ -1421,7 +1421,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, /* 4.19.6 Port Test Modes (USB2 Test Mode) */ if (hcd->speed != HCD_USB2) goto error; - if (test_mode > TEST_FORCE_EN || test_mode < TEST_J) + if (test_mode > USB_TEST_FORCE_ENABLE || + test_mode < USB_TEST_J) goto error; retval = xhci_enter_test_mode(xhci, test_mode, wIndex, &flags); diff --git a/drivers/usb/misc/ehset.c b/drivers/usb/misc/ehset.c index 7895d61e733b..2752e1f4f4d0 100644 --- a/drivers/usb/misc/ehset.c +++ b/drivers/usb/misc/ehset.c @@ -33,28 +33,28 @@ static int ehset_probe(struct usb_interface *intf, ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_SE0_NAK << 8) | portnum, + (USB_TEST_SE0_NAK << 8) | portnum, NULL, 0, 1000); break; case TEST_J_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_J << 8) | portnum, + (USB_TEST_J << 8) | portnum, NULL, 0, 1000); break; case TEST_K_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_K << 8) | portnum, + (USB_TEST_K << 8) | portnum, NULL, 0, 1000); break; case TEST_PACKET_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_PACKET << 8) | portnum, + (USB_TEST_PACKET << 8) | portnum, NULL, 0, 1000); break; case TEST_HS_HOST_PORT_SUSPEND_RESUME: diff --git a/drivers/usb/mtu3/mtu3_gadget_ep0.c b/drivers/usb/mtu3/mtu3_gadget_ep0.c index 2be182bd793a..563a0a2e970d 100644 --- a/drivers/usb/mtu3/mtu3_gadget_ep0.c +++ b/drivers/usb/mtu3/mtu3_gadget_ep0.c @@ -278,20 +278,20 @@ static int handle_test_mode(struct mtu3 *mtu, struct usb_ctrlrequest *setup) u32 value; switch (le16_to_cpu(setup->wIndex) >> 8) { - case TEST_J: - dev_dbg(mtu->dev, "TEST_J\n"); + case USB_TEST_J: + dev_dbg(mtu->dev, "USB_TEST_J\n"); mtu->test_mode_nr = TEST_J_MODE; break; - case TEST_K: - dev_dbg(mtu->dev, "TEST_K\n"); + case USB_TEST_K: + dev_dbg(mtu->dev, "USB_TEST_K\n"); mtu->test_mode_nr = TEST_K_MODE; break; - case TEST_SE0_NAK: - dev_dbg(mtu->dev, "TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + dev_dbg(mtu->dev, "USB_TEST_SE0_NAK\n"); mtu->test_mode_nr = TEST_SE0_NAK_MODE; break; - case TEST_PACKET: - dev_dbg(mtu->dev, "TEST_PACKET\n"); + case USB_TEST_PACKET: + dev_dbg(mtu->dev, "USB_TEST_PACKET\n"); mtu->test_mode_nr = TEST_PACKET_MODE; break; default: diff --git a/drivers/usb/musb/musb_gadget_ep0.c b/drivers/usb/musb/musb_gadget_ep0.c index 91a5027b5c1f..0ae3e0be043e 100644 --- a/drivers/usb/musb/musb_gadget_ep0.c +++ b/drivers/usb/musb/musb_gadget_ep0.c @@ -311,27 +311,23 @@ __acquires(musb->lock) goto stall; switch (ctrlrequest->wIndex >> 8) { - case 1: - pr_debug("TEST_J\n"); - /* TEST_J */ + case USB_TEST_J: + pr_debug("USB_TEST_J\n"); musb->test_mode_nr = MUSB_TEST_J; break; - case 2: - /* TEST_K */ - pr_debug("TEST_K\n"); + case USB_TEST_K: + pr_debug("USB_TEST_K\n"); musb->test_mode_nr = MUSB_TEST_K; break; - case 3: - /* TEST_SE0_NAK */ - pr_debug("TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + pr_debug("USB_TEST_SE0_NAK\n"); musb->test_mode_nr = MUSB_TEST_SE0_NAK; break; - case 4: - /* TEST_PACKET */ - pr_debug("TEST_PACKET\n"); + case USB_TEST_PACKET: + pr_debug("USB_TEST_PACKET\n"); musb->test_mode_nr = MUSB_TEST_PACKET; break; diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c index a84ec27c4c12..cb7ae297a3af 100644 --- a/drivers/usb/musb/musb_virthub.c +++ b/drivers/usb/musb/musb_virthub.c @@ -385,25 +385,25 @@ int musb_hub_control( wIndex >>= 8; switch (wIndex) { - case 1: - pr_debug("TEST_J\n"); + case USB_TEST_J: + pr_debug("USB_TEST_J\n"); temp = MUSB_TEST_J; break; - case 2: - pr_debug("TEST_K\n"); + case USB_TEST_K: + pr_debug("USB_TEST_K\n"); temp = MUSB_TEST_K; break; - case 3: - pr_debug("TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + pr_debug("USB_TEST_SE0_NAK\n"); temp = MUSB_TEST_SE0_NAK; break; - case 4: - pr_debug("TEST_PACKET\n"); + case USB_TEST_PACKET: + pr_debug("USB_TEST_PACKET\n"); temp = MUSB_TEST_PACKET; musb_load_testpacket(musb); break; - case 5: - pr_debug("TEST_FORCE_ENABLE\n"); + case USB_TEST_FORCE_ENABLE: + pr_debug("USB_TEST_FORCE_ENABLE\n"); temp = MUSB_TEST_FORCE_HOST | MUSB_TEST_FORCE_HS; diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index b1ed2ccfe9cf..48766fdf6580 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -138,11 +138,11 @@ * Test Mode Selectors * See USB 2.0 spec Table 9-7 */ -#define TEST_J 1 -#define TEST_K 2 -#define TEST_SE0_NAK 3 -#define TEST_PACKET 4 -#define TEST_FORCE_EN 5 +#define USB_TEST_J 1 +#define USB_TEST_K 2 +#define USB_TEST_SE0_NAK 3 +#define USB_TEST_PACKET 4 +#define USB_TEST_FORCE_ENABLE 5 /* Status Type */ #define USB_STATUS_TYPE_STANDARD 0 -- cgit v1.2.3-59-g8ed1b From f9bcf96837f158db6ea982d15cd2c8161ca6bc23 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 20 Jun 2020 18:30:52 +0300 Subject: bpf: Add SO_KEEPALIVE and related options to bpf_setsockopt This patch adds support of SO_KEEPALIVE flag and TCP related options to bpf_setsockopt() routine. This is helpful if we want to enable or tune TCP keepalive for applications which don't do it in the userspace code. v3: - update kernel-doc in uapi (Nikita Vetoshkin ) v4: - update kernel-doc in tools too (Alexei Starovoitov) - add test to selftests (Alexei Starovoitov) Signed-off-by: Dmitry Yakunin Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200620153052.9439-3-zeil@yandex-team.ru --- include/uapi/linux/bpf.h | 7 +++-- net/core/filter.c | 36 ++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 7 +++-- tools/testing/selftests/bpf/progs/connect4_prog.c | 27 +++++++++++++++++ 4 files changed, 72 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..c713b6b8938f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4289,10 +4289,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { char devname[IFNAMSIZ]; + int val, valbool; struct net *net; int ifindex; int ret = 0; - int val; if (!sk_fullsock(sk)) return -EINVAL; @@ -4303,6 +4303,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); + valbool = val ? 1 : 0; /* Only some socketops are supported */ switch (optname) { @@ -4361,6 +4362,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } ret = sock_bindtoindex(sk, ifindex, false); break; + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; default: ret = -EINVAL; } @@ -4421,6 +4427,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) @@ -4449,6 +4456,33 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else tp->save_syn = val; break; + case TCP_KEEPIDLE: + ret = tcp_sock_set_keepidle_locked(sk, val); + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + ret = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + ret = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + ret = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + case TCP_USER_TIMEOUT: + if (val < 0) + ret = -EINVAL; + else + icsk->icsk_user_timeout = val; + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 1ab2c5eba86c..b1b2773c0b9d 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -104,6 +104,30 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_keepalive(struct bpf_sock_addr *ctx) +{ + int zero = 0, one = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one))) + return 1; + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPIDLE, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPINTVL, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_SYNCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_USER_TIMEOUT, &one, sizeof(one))) + return 1; + } + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &zero, sizeof(zero))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -121,6 +145,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (bind_to_device(ctx)) return 0; + if (set_keepalive(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3-59-g8ed1b From 899426b3bdd947541ba4af8c767575889c8b842a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:16 +0300 Subject: net: neighbor: add fdb extended attribute Add an attribute to NDA which will contain all future fdb-specific attributes in order to avoid polluting the NDA namespace with e.g. bridge or vxlan specific attributes. The attribute is called NDA_FDB_EXT_ATTRS and the structure would look like: [NDA_FDB_EXT_ATTRS] = { [NFEA_xxx] } Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 12 ++++++++++++ net/core/neighbour.c | 1 + 2 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index eefcda8ca44e..540ff48402a1 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -30,6 +30,7 @@ enum { NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ NDA_NH_ID, + NDA_FDB_EXT_ATTRS, __NDA_MAX }; @@ -172,4 +173,15 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) +/* embedded into NDA_FDB_EXT_ATTRS: + * [NDA_FDB_EXT_ATTRS] = { + * ... + * } + */ +enum { + NFEA_UNSPEC, + __NFEA_MAX +}; +#define NFEA_MAX (__NFEA_MAX - 1) + #endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef6b5a8f629c..8e39e28b0a8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1783,6 +1783,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3-59-g8ed1b From 31cbc39b6344916c20452e43a9171009214c409c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:17 +0300 Subject: net: bridge: add option to allow activity notifications for any fdb entries This patch adds the ability to notify about activity of any entries (static, permanent or ext_learn). EVPN multihoming peers need it to properly and efficiently handle mac sync (peer active/locally active). We add a new NFEA_ACTIVITY_NOTIFY attribute which is used to dump the current activity state and to control if static entries should be monitored at all. We use 2 bits - one to activate fdb entry tracking (disabled by default) and the second to denote that an entry is inactive. We need the second bit in order to avoid multiple notifications of inactivity. Obviously this makes no difference for dynamic entries since at the time of inactivity they get deleted, while the tracked non-dynamic entries get the inactive bit set and get a notification. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 11 ++++ net/bridge/br_fdb.c | 117 ++++++++++++++++++++++++++++++++++++----- net/bridge/br_private.h | 4 ++ 3 files changed, 119 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 540ff48402a1..21e569297355 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -173,13 +173,24 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) + /* FDB activity notification bits used in NFEA_ACTIVITY_NOTIFY: + * - FDB_NOTIFY_BIT - notify on activity/expire for any entry + * - FDB_NOTIFY_INACTIVE_BIT - mark as inactive to avoid multiple notifications + */ +enum { + FDB_NOTIFY_BIT = (1 << 0), + FDB_NOTIFY_INACTIVE_BIT = (1 << 1) +}; + /* embedded into NDA_FDB_EXT_ATTRS: * [NDA_FDB_EXT_ATTRS] = { + * [NFEA_ACTIVITY_NOTIFY] * ... * } */ enum { NFEA_UNSPEC, + NFEA_ACTIVITY_NOTIFY, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ed80d9ab0fb9..642deb57c064 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -349,12 +349,21 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer; + unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || - test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { + if (test_bit(BR_FDB_NOTIFY, &f->flags)) { + if (time_after(this_timer, now)) + work_delay = min(work_delay, + this_timer - now); + else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, + &f->flags)) + fdb_notify(br, f, RTM_NEWNEIGH, false); + } continue; - this_timer = f->updated + delay; + } + if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { @@ -556,11 +565,17 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return ret; } +/* returns true if the fdb was modified */ +static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) +{ + return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && + test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); +} + void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; - bool fdb_modified = false; /* some users want to always flood. */ if (hold_time(br) == 0) @@ -575,6 +590,12 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name, addr, vid); } else { unsigned long now = jiffies; + bool fdb_modified = false; + + if (now != fdb->updated) { + fdb->updated = now; + fdb_modified = __fdb_mark_active(fdb); + } /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && @@ -587,8 +608,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } - if (now != fdb->updated) - fdb->updated = now; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { @@ -667,6 +687,23 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, &fdb->key.vlan_id)) goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + nlmsg_end(skb, nlh); return 0; @@ -681,7 +718,9 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)); + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, @@ -791,14 +830,40 @@ errout: return err; } +/* returns true if the fdb is modified */ +static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) +{ + bool modified = false; + + /* allow to mark an entry as inactive, usually done on creation */ + if ((notify & FDB_NOTIFY_INACTIVE_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + modified = true; + + if ((notify & FDB_NOTIFY_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* enabled activity tracking */ + modified = true; + } else if (!(notify & FDB_NOTIFY_BIT) && + test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* disabled activity tracking, clear notify state */ + clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); + modified = true; + } + + return modified; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid) + const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, + struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; + u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && @@ -815,6 +880,13 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; + if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { + notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); + if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || + (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) + return -EINVAL; + } + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -858,6 +930,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (fdb_handle_notify(fdb, notify)) + modified = true; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; @@ -871,7 +946,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) { int err = 0; @@ -893,19 +968,24 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); - err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid); + err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } +static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { + [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, +}; + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { + struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; @@ -938,6 +1018,16 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FDB_EXT_ATTRS]) { + attr = tb[NDA_FDB_EXT_ATTRS]; + err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, + br_nda_fdb_pol, extack); + if (err) + return err; + } else { + memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); + } + if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { @@ -946,9 +1036,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); if (err || !vg || !vg->num_vlans) goto out; @@ -959,7 +1049,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, + nfea_tb); if (err) goto out; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7501be4eeba0..c0ae639e1b36 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -48,6 +48,8 @@ enum { /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" +#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -184,6 +186,8 @@ enum { BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, + BR_FDB_NOTIFY, + BR_FDB_NOTIFY_INACTIVE }; struct net_bridge_fdb_key { -- cgit v1.2.3-59-g8ed1b From b5f1d9ec283bd28a452cf61d7e5c2f2b1a9cccda Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:18 +0300 Subject: net: bridge: add a flag to avoid refreshing fdb when changing/adding When we modify or create a new fdb entry sometimes we want to avoid refreshing its activity in order to track it properly. One example is when a mac is received from EVPN multi-homing peer by FRR, which doesn't want to change local activity accounting. It makes it static and sets a flag to track its activity. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 1 + net/bridge/br_fdb.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 21e569297355..dc8b72201f6c 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -191,6 +191,7 @@ enum { enum { NFEA_UNSPEC, NFEA_ACTIVITY_NOTIFY, + NFEA_DONT_REFRESH, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 642deb57c064..9db504baa094 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -860,6 +860,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); + bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; @@ -937,7 +938,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, fdb->used = jiffies; if (modified) { - fdb->updated = jiffies; + if (refresh) + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -977,6 +979,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, + [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ -- cgit v1.2.3-59-g8ed1b From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:09 -0700 Subject: bpf: Add bpf_skc_to_tcp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a tcp6_sock pointer. The return value could be NULL if the casting is illegal. A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added so the verifier is able to deduce proper return types for the helper. Different from the previous BTF_ID based helpers, the bpf_skc_to_tcp6_sock() argument can be several possible btf_ids. More specifically, all possible socket data structures with sock_common appearing in the first in the memory layout. This patch only added socket types related to tcp and udp. All possible argument btf_id and return value btf_id for helper bpf_skc_to_tcp6_sock() are pre-calculcated and cached. In the future, it is even possible to precompute these btf_id's at kernel build time. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++ include/uapi/linux/bpf.h | 9 ++++- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 43 ++++++++++++++++------ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 82 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++- 8 files changed, 148 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e1501ee53ce..c0e38ad07848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -265,6 +265,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ + RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -287,6 +288,12 @@ struct bpf_func_proto { enum bpf_arg_type arg_type[5]; }; int *btf_id; /* BTF ids of arguments */ + bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is + * valid. Often used if more + * than one btf id is permitted + * for this argument. + */ + int *ret_btf_id; /* return value btf_id */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1524,6 +1531,7 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +void init_btf_sock_ids(struct btf *btf); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1549,6 +1557,9 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline void init_btf_sock_ids(struct btf *btf) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) @@ -1638,6 +1649,7 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e377d1981730..4c3007f428b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); + init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7460f967cb75..7de98906ddf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0159f12d2af5..2a97a268f533 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1515,6 +1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index c713b6b8938f..176e27d75c51 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -9225,3 +9226,84 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +/* Define a list of socket types which can be the argument for + * skc_to_*_sock() helpers. All these sockets should have + * sock_common as the first argument in its memory layout. + */ +#define BTF_SOCK_TYPE_xxx \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + +enum { +#define BTF_SOCK_TYPE(name, str) name, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +MAX_BTF_SOCK_TYPE, +}; + +static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; + +#ifdef CONFIG_BPF_SYSCALL +static const char *bpf_sock_types[] = { +#define BTF_SOCK_TYPE(name, str) str, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +}; + +void init_btf_sock_ids(struct btf *btf) +{ + int i, btf_id; + + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { + btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], + BTF_KIND_STRUCT); + if (btf_id > 0) + btf_sock_ids[i] = btf_id; + } +} +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 91fa668fa860..6c2f64118651 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -421,6 +421,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -458,6 +459,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:11 -0700 Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers Three more helpers are added to cast a sock_common pointer to an tcp_sock, tcp_timewait_sock or a tcp_request_sock for tracing programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 23 +++++++++++++++- kernel/trace/bpf_trace.c | 6 ++++ net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 6 ++++ tools/include/uapi/linux/bpf.h | 23 +++++++++++++++- 6 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0e38ad07848..c23998cf6699 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1650,6 +1650,9 @@ extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a97a268f533..48d935b0d87c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 176e27d75c51..0b4e5aed7e20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -74,6 +74,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -9307,3 +9308,64 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6c2f64118651..d886657c6aaa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -422,6 +422,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -460,6 +463,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:15 -0700 Subject: bpf: Add bpf_skc_to_udp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a udp6_sock pointer. The return value could be NULL if the casting is illegal. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 22 ++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++++++- 6 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c23998cf6699..3d2ade703a35 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1653,6 +1653,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 48d935b0d87c..5d59dda5f661 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 0b4e5aed7e20..c796e141ea8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9369,3 +9369,25 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index d886657c6aaa..6bab40ff442e 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -425,6 +425,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -466,6 +467,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From bccb48c89fe3c09f1cbb7c8612e31f5daa1d4541 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Jun 2020 20:13:21 +0200 Subject: batman-adv: Fix typos and grammar in documentation Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/batman-adv.rst | 6 ++-- include/uapi/linux/batadv_packet.h | 50 ++++++++++++++++----------------- include/uapi/linux/batman_adv.h | 4 +-- net/batman-adv/bat_iv_ogm.c | 8 +++--- net/batman-adv/bat_v_elp.c | 10 +++---- net/batman-adv/bat_v_ogm.c | 14 ++++----- net/batman-adv/bridge_loop_avoidance.c | 6 ++-- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/fragmentation.c | 6 ++-- net/batman-adv/hard-interface.c | 14 ++++----- net/batman-adv/log.h | 6 ++-- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 6 ++-- net/batman-adv/multicast.c | 21 +++++++------- net/batman-adv/netlink.c | 2 +- net/batman-adv/network-coding.c | 14 ++++----- net/batman-adv/originator.c | 8 +++--- net/batman-adv/routing.c | 4 +-- net/batman-adv/send.c | 4 +-- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/tp_meter.c | 12 ++++---- net/batman-adv/translation-table.c | 10 +++---- net/batman-adv/tvlv.c | 4 +-- net/batman-adv/types.h | 12 ++++---- 24 files changed, 114 insertions(+), 113 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst index 02af49b08635..74821d29a22f 100644 --- a/Documentation/networking/batman-adv.rst +++ b/Documentation/networking/batman-adv.rst @@ -73,7 +73,7 @@ lower value. This will make the mesh more responsive to topology changes, but will also increase the overhead. Information about the current state can be accessed via the batadv generic -netlink family. batctl provides human readable version via its debug tables +netlink family. batctl provides a human readable version via its debug tables subcommands. @@ -115,8 +115,8 @@ are prefixed with "batman-adv:" So to see just these messages try:: $ dmesg | grep batman-adv When investigating problems with your mesh network, it is sometimes necessary to -see more detail debug messages. This must be enabled when compiling the -batman-adv module. When building batman-adv as part of kernel, use "make +see more detailed debug messages. This must be enabled when compiling the +batman-adv module. When building batman-adv as part of the kernel, use "make menuconfig" and enable the option ``B.A.T.M.A.N. debugging`` (``CONFIG_BATMAN_ADV_DEBUG=y``). diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 0ae34c85ef9e..9c8604c5b5f6 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -72,8 +72,8 @@ enum batadv_subtype { /** * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets - * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was - * previously received from someone else than the best neighbor. + * @BATADV_NOT_BEST_NEXT_HOP: flag is set when the ogm packet is forwarded and + * was previously received from someone other than the best neighbor. * @BATADV_PRIMARIES_FIRST_HOP: flag unused. * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a * one hop neighbor on the interface where it was originally received. @@ -195,8 +195,8 @@ struct batadv_bla_claim_dst { /** * struct batadv_ogm_packet - ogm (routing protocol) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @flags: contains routing relevant flags - see enum batadv_iv_flags * @seqno: sequence identification * @orig: address of the source node @@ -247,7 +247,7 @@ struct batadv_ogm2_packet { /** * struct batadv_elp_packet - elp (neighbor discovery) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header + * @version: batman-adv protocol version, part of the general header * @orig: originator mac address * @seqno: sequence number * @elp_interval: currently used ELP sending interval in ms @@ -265,15 +265,15 @@ struct batadv_elp_packet { /** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node * @uid: local ICMP socket identifier * @align: not used - useful for alignment purposes only * - * This structure is used for ICMP packets parsing only and it is never sent + * This structure is used for ICMP packet parsing only and it is never sent * over the wire. The alignment field at the end is there to ensure that * members are padded the same way as they are in real packets. */ @@ -291,8 +291,8 @@ struct batadv_icmp_header { /** * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -315,8 +315,8 @@ struct batadv_icmp_packet { /** * struct batadv_icmp_tp_packet - ICMP TP Meter packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -358,8 +358,8 @@ enum batadv_icmp_tp_subtype { /** * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -397,8 +397,8 @@ struct batadv_icmp_packet_rr { /** * struct batadv_unicast_packet - unicast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @ttvn: translation table version number * @dest: originator destination of the unicast packet */ @@ -433,8 +433,8 @@ struct batadv_unicast_4addr_packet { /** * struct batadv_frag_packet - fragmented packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence @@ -467,8 +467,8 @@ struct batadv_frag_packet { /** * struct batadv_bcast_packet - broadcast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved byte for alignment * @seqno: sequence identification * @orig: originator of the broadcast packet @@ -488,10 +488,10 @@ struct batadv_bcast_packet { /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @first_source: original source of first included packet - * @first_orig_dest: original destinal of first included packet + * @first_orig_dest: original destination of first included packet * @first_crc: checksum of first included packet * @first_ttvn: tt-version number of first included packet * @second_ttl: ttl of second packet @@ -523,8 +523,8 @@ struct batadv_coded_packet { /** * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved field (for packet alignment) * @src: address of the source * @dst: address of the destination diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 617c180ff0c8..8cf2ad11ead9 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -69,7 +69,7 @@ enum batadv_tt_client_flags { /** * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be - * part of the network but no nnode has already announced it + * part of the network but no node has already announced it */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; @@ -131,7 +131,7 @@ enum batadv_gw_modes { /** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */ BATADV_GW_MODE_CLIENT, - /** @BATADV_GW_MODE_SERVER: announce itself as gatway server */ + /** @BATADV_GW_MODE_SERVER: announce itself as gateway server */ BATADV_GW_MODE_SERVER, }; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e87f19c82e8d..5b3a41983156 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -134,7 +134,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) * * Return: the originator object corresponding to the passed mac address or NULL * on failure. - * If the object does not exists it is created an initialised. + * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -871,7 +871,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast @@ -1554,7 +1554,7 @@ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) @@ -2288,7 +2288,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @single_hardif: Limit dump to this hard interfaace + * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0bdefa35da98..d35aca0e969a 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -60,7 +60,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). */ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { @@ -183,8 +183,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) * * Sends a predefined number of unicast wifi packets to a given neighbour in * order to trigger the throughput estimation on this link by the RC algorithm. - * Packets are sent only if there there is not enough payload unicast traffic - * towards this neighbour.. + * Packets are sent only if there is not enough payload unicast traffic towards + * this neighbour.. * * Return: True on success and false in case of error during skb preparation. */ @@ -244,7 +244,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * - * Emits broadcast ELP message in regular intervals. + * Emits broadcast ELP messages in regular intervals. */ static void batadv_v_elp_periodic_work(struct work_struct *work) { @@ -499,7 +499,7 @@ orig_free: * @skb: the received packet * @if_incoming: the interface this packet was received through * - * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly * processed or NET_RX_DROP in case of failure. */ int batadv_v_elp_packet_recv(struct sk_buff *skb, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 18028b9f95f0..0d404f7bcd9f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -47,9 +47,9 @@ * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * - * Return: the orig_node corresponding to the specified address. If such object - * does not exist it is allocated here. In case of allocation failure returns - * NULL. + * Return: the orig_node corresponding to the specified address. If such an + * object does not exist, it is allocated here. In case of allocation failure + * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -172,7 +172,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * - * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ @@ -378,7 +378,7 @@ static void batadv_v_ogm_send(struct work_struct *work) * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * - * Emits aggregated OGM message in regular intervals. + * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { @@ -399,7 +399,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * - * Takes care of scheduling own OGM sending routine for this interface. + * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ @@ -847,7 +847,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 41cc87f06b14..91a04ca373dc 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -992,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * - * checks if it is a claim packet and if its on the same group. + * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * @@ -1757,7 +1757,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, - * throw an uevent and log the event if that is the case. + * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. @@ -1815,7 +1815,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function + * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b85da4b7a77b..0e6e53e9b5f3 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -666,7 +666,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * - * This function copies the skb with pskb_copy() and is sent as unicast packet + * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 7cad97644d05..9fdbe3068153 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -102,8 +102,8 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Return: true if chain is empty and caller can just insert the new fragment - * without searching for the right position. + * Return: true if chain is empty and the caller can just insert the new + * fragment without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not not * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3a256af92784..53c27c67cc11 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -138,10 +138,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it - * is important to prevent this new interface to be used to create a new mesh - * network (this behaviour would lead to a batman-over-batman configuration). - * This function recursively checks all the fathers of the device passed as - * argument looking for a batman-adv soft interface. + * is important to prevent this new interface from being used to create a new + * mesh network (this behaviour would lead to a batman-over-batman + * configuration). This function recursively checks all the fathers of the + * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -680,8 +680,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * - * Invoke ndo_del_slave on master passing slave as argument. In this way slave - * is free'd and master can correctly change its internal state. + * Invoke ndo_del_slave on master passing slave as argument. In this way the + * slave is free'd and the master can correctly change its internal state. * * Return: 0 on success, a negative value representing the error otherwise */ @@ -818,7 +818,7 @@ err: * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be - * off when another functions is modifying the list at the same time. The + * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index f9884dc56cf3..979864c0fa6b 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -69,7 +69,7 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); /** - * _batadv_dbg() - Store debug output with(out) ratelimiting + * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited @@ -95,7 +95,7 @@ static inline void _batadv_dbg(int type __always_unused, #endif /** - * batadv_dbg() - Store debug output without ratelimiting + * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments @@ -104,7 +104,7 @@ static inline void _batadv_dbg(int type __always_unused, _batadv_dbg(type, bat_priv, 0, ## arg) /** - * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d8a255c85e77..519c08c2cfba 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -666,7 +666,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Return: true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 42b8d1e76dea..0393bb9ed3d0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -308,7 +308,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: @@ -330,11 +330,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, /** * batadv_seq_after() - Checks if a sequence number x is a successor of y - * @x: potential sucessor of @y + * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9ebdc1e864b9..bdc4a1fba1c6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -510,7 +510,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one + * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. @@ -832,8 +832,8 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast TVLV flags this nodes announces change this notifies - * userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this node announces change, this function + * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { @@ -1244,7 +1244,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * @@ -1693,7 +1693,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier @@ -1742,7 +1742,8 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, - * orig, has toggled then this method updates counter and list accordingly. + * orig, has toggled then this method updates the counter and the list + * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1787,7 +1788,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1832,7 +1833,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1877,7 +1878,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1922,7 +1923,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 02ed073f95a9..cfb00dfa468a 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -640,7 +640,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop - * @test_time: total time ot the tp_meter session + * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b0469d15da0e..48d707850f3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -134,7 +134,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init() - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -700,7 +700,7 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker() - periodic task for house keeping related to network + * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ @@ -1316,7 +1316,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1402,10 +1402,10 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * - * Loops through list of neighboring nodes the next hop has a good connection to - * (receives OGMs with a sufficient quality). We need to find a neighbor of our - * next hop that potentially sent a packet which our next hop also received - * (overheard) and has stored for later decoding. + * Loops through the list of neighboring nodes the next hop has a good + * connection to (receives OGMs with a sufficient quality). We need to find a + * neighbor of our next hop that potentially sent a packet which our next hop + * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 5b0c2fffc214..805d8969bdfb 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -325,7 +325,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Return: the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -515,7 +515,7 @@ out: * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -620,7 +620,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * * Looks for and possibly returns a neighbour belonging to this hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -999,7 +999,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * - * Creates a new originator object and initialise all the generic fields. + * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d343382e9664..27cdf5e4349a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -449,7 +449,7 @@ free_skb: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. + * Checks for short header and bad addresses in the given packet. * * Return: negative value when check fails and 0 otherwise. The negative value * depends on the reason: -ENODATA for bad header, -EBADR for broadcast @@ -1113,7 +1113,7 @@ free_skb: * @recv_if: interface that the skb is received on * * This function does one of the three following things: 1) Forward fragment, if - * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till + * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still * lack further fragments; 3) Merge fragments, if we have all needed parts. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7f8ade04e08e..d267b94800d6 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -605,8 +605,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * - * The packets are being moved from the forw_list to the cleanup_list and - * by that allows already running threads to notice the claiming. + * The packets are being moved from the forw_list to the cleanup_list. This + * makes it possible for already running threads to notice the claim. */ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f1f1c86f3419..23833a0ba5e6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -406,7 +406,7 @@ end: * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends a ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bd2ac570c42c..db7e3774825b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -66,7 +66,7 @@ /** * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond - * such amound of milliseconds, the receiver is considered unreachable and the + * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 @@ -108,10 +108,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size - * @min: minumim cwnd value (usually MSS) + * @min: minimum cwnd value (usually MSS) * - * Return the new cwnd size and ensures it does not exceed the Advertised - * Receiver Window size. It is wrap around safe. + * Return the new cwnd size and ensure it does not exceed the Advertised + * Receiver Window size. It is wrapped around safely. * For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes @@ -254,7 +254,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after - * having incremented the refcounter. Return NULL is not found + * having increment the refcounter. Return NULL is not found * * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ @@ -291,7 +291,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, * @session: session identifier * * Look for a tp_vars object matching dst as end_point, session as tp meter - * session and return it after having incremented the refcounter. Return NULL + * session and return it after having increment the refcounter. Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a9635c882fe0..98a0aaaf0d50 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -301,7 +301,7 @@ void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data - * (excluding ourself). + * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -842,7 +842,7 @@ out: * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its - * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data + * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. @@ -1674,7 +1674,7 @@ out: * the function argument. * If a TT local entry exists for this non-mesh client remove it. * - * The caller must hold orig_node refcount. + * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ @@ -1839,7 +1839,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * @@ -1887,7 +1887,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). */ static void batadv_tt_global_print_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0963a43ad996..6a23a566cde1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -353,8 +353,8 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Return: success if handler was not found or the return value of the handler - * callback. + * Return: success if the handler was not found or the return value of the + * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d152b8e81f61..cc151e1f23b2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -455,8 +455,8 @@ struct batadv_orig_node { spinlock_t tt_buff_lock; /** - * @tt_lock: prevents from updating the table while reading it. Table - * update is made up by two operations (data structure update and + * @tt_lock: avoids concurrent read from and write to the table. Table + * update is made up of two operations (data structure update and * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. @@ -748,7 +748,7 @@ struct batadv_neigh_ifinfo { * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression */ struct batadv_bcast_duplist_entry { - /** @orig: mac address of orig node orginating the broadcast */ + /** @orig: mac address of orig node originating the broadcast */ u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ @@ -1010,7 +1010,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading - * the local table. The local TT commit is made up by two operations + * the local table. The local TT commit is made up of two operations * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. @@ -1024,7 +1024,7 @@ struct batadv_priv_tt { #ifdef CONFIG_BATMAN_ADV_BLA /** - * struct batadv_priv_bla - per mesh interface bridge loope avoidance data + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data */ struct batadv_priv_bla { /** @num_requests: number of bla requests in flight */ @@ -1718,7 +1718,7 @@ struct batadv_priv { spinlock_t softif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA - /** @bla: bridge loope avoidance data */ + /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif -- cgit v1.2.3-59-g8ed1b From 3bda14d09dc5789a895ab02b7dcfcec19b0a65b3 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 1 Jun 2020 22:35:22 +0200 Subject: batman-adv: Introduce a configurable per interface hop penalty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some setups multiple hard interfaces with similar link qualities or throughput values are available. But people have expressed the desire to consider one of them as a backup only. Some creative solutions are currently in use: Such people are configuring multiple batman-adv mesh/soft interfaces, wire them together with some veth pairs and then tune the hop penalty to achieve an effect similar to a tunable per interface hop penalty. This patch introduces a new, configurable, per hard interface hop penalty to simplify such setups. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 3 ++- net/batman-adv/bat_iv_ogm.c | 17 +++++++++-------- net/batman-adv/bat_v_ogm.c | 13 ++++++++++--- net/batman-adv/hard-interface.c | 2 ++ net/batman-adv/netlink.c | 12 +++++++++++- net/batman-adv/types.h | 6 ++++++ 6 files changed, 40 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 8cf2ad11ead9..bb0ae945b36a 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -427,7 +427,8 @@ enum batadv_nl_attrs { /** * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied - * to an originator message's tq-field on every hop. + * to an originator message's tq-field on every hop and/or per + * hard interface */ BATADV_ATTR_HOP_PENALTY, diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5b3a41983156..a4faf5f904d9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1075,10 +1075,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - unsigned int tq_iface_penalty; bool ret = false; /* find corresponding one hop neighbor */ @@ -1157,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; + tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ - tq_iface_penalty = BATADV_TQ_MAX_VALUE; if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) - tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, - bat_priv); + tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, + bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * - tq_iface_penalty; + tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", + "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, - batadv_ogm_packet->tq, if_incoming->net_dev->name, + neigh_rq_count, tq_own, tq_asym_penalty, + tq_iface_hop_penalty, batadv_ogm_packet->tq, + if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 0d404f7bcd9f..0f8495b9eeb1 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -455,15 +455,17 @@ unlock: * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the - * characteristic of the interface where the OGM has been received. The return - * value is computed as follows: + * characteristic of the interface where the OGM has been received. + * + * Initially the per hardif hop penalty is applied to the throughput. After + * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) - * - throughput * hop penalty otherwise + * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ @@ -472,9 +474,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, u32 throughput) { + int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; + /* Apply per hardif hop penalty */ + throughput = throughput * (hop_penalty_max - if_hop_penalty) / + hop_penalty_max; + /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 53c27c67cc11..fa06b51c0144 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -939,6 +939,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + atomic_set(&hard_iface->hop_penalty, 0); + batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index cfb00dfa468a..dc193618a761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -826,6 +826,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&hard_iface->hop_penalty))) + goto nla_put_failure; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) @@ -920,9 +924,15 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); + } #ifdef CONFIG_BATMAN_ADV_BATMAN_V - struct nlattr *attr; if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cc151e1f23b2..ed519efa3c36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -208,6 +208,12 @@ struct batadv_hard_iface { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + /** + * @hop_penalty: penalty which will be applied to the tq-field + * of an OGM received via this interface + */ + atomic_t hop_penalty; + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; -- cgit v1.2.3-59-g8ed1b From 322b598be4d9b9090cda560c4caab78704615ab4 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:44 +0800 Subject: fpga: dfl: introduce interrupt trigger setting API FPGA user applications may be interested in interrupts generated by DFL features. For example, users can implement their own FPGA logics with interrupts enabled in AFU (Accelerated Function Unit, dynamic region of DFL based FPGA). So user applications need to be notified to handle these interrupts. In order to allow userspace applications to monitor interrupts, driver requires userspace to provide eventfds as interrupt notification channels. Applications then poll/select on the eventfds to get notified. This patch introduces a generic helper functions to do eventfds binding with given interrupts. Sub feature drivers are expected to use XXX_GET_IRQ_NUM to query irq info, and XXX_SET_IRQ to set eventfds for interrupts. This patch also introduces helper functions for these 2 ioctls. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Signed-off-by: Tom Rix Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl.c | 157 ++++++++++++++++++++++++++++++++++++++++++ drivers/fpga/dfl.h | 16 +++++ include/uapi/linux/fpga-dfl.h | 13 ++++ 3 files changed, 186 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index d915331fb52c..649958a36e62 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -10,7 +10,9 @@ * Wu Hao * Xiao Guangrong */ +#include #include +#include #include "dfl.h" @@ -533,6 +535,7 @@ static int build_info_commit_dev(struct build_feature_devs_info *binfo) unsigned int i; /* save resource information for each feature */ + feature->dev = fdev; feature->id = finfo->fid; feature->resource_index = index; feature->ioaddr = finfo->ioaddr; @@ -1393,6 +1396,160 @@ done: } EXPORT_SYMBOL_GPL(dfl_fpga_cdev_config_ports_vf); +static irqreturn_t dfl_irq_handler(int irq, void *arg) +{ + struct eventfd_ctx *trigger = arg; + + eventfd_signal(trigger, 1); + return IRQ_HANDLED; +} + +static int do_set_irq_trigger(struct dfl_feature *feature, unsigned int idx, + int fd) +{ + struct platform_device *pdev = feature->dev; + struct eventfd_ctx *trigger; + int irq, ret; + + irq = feature->irq_ctx[idx].irq; + + if (feature->irq_ctx[idx].trigger) { + free_irq(irq, feature->irq_ctx[idx].trigger); + kfree(feature->irq_ctx[idx].name); + eventfd_ctx_put(feature->irq_ctx[idx].trigger); + feature->irq_ctx[idx].trigger = NULL; + } + + if (fd < 0) + return 0; + + feature->irq_ctx[idx].name = + kasprintf(GFP_KERNEL, "fpga-irq[%u](%s-%llx)", idx, + dev_name(&pdev->dev), feature->id); + if (!feature->irq_ctx[idx].name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + ret = PTR_ERR(trigger); + goto free_name; + } + + ret = request_irq(irq, dfl_irq_handler, 0, + feature->irq_ctx[idx].name, trigger); + if (!ret) { + feature->irq_ctx[idx].trigger = trigger; + return ret; + } + + eventfd_ctx_put(trigger); +free_name: + kfree(feature->irq_ctx[idx].name); + + return ret; +} + +/** + * dfl_fpga_set_irq_triggers - set eventfd triggers for dfl feature interrupts + * + * @feature: dfl sub feature. + * @start: start of irq index in this dfl sub feature. + * @count: number of irqs. + * @fds: eventfds to bind with irqs. unbind related irq if fds[n] is negative. + * unbind "count" specified number of irqs if fds ptr is NULL. + * + * Bind given eventfds with irqs in this dfl sub feature. Unbind related irq if + * fds[n] is negative. Unbind "count" specified number of irqs if fds ptr is + * NULL. + * + * Return: 0 on success, negative error code otherwise. + */ +int dfl_fpga_set_irq_triggers(struct dfl_feature *feature, unsigned int start, + unsigned int count, int32_t *fds) +{ + unsigned int i; + int ret = 0; + + /* overflow */ + if (unlikely(start + count < start)) + return -EINVAL; + + /* exceeds nr_irqs */ + if (start + count > feature->nr_irqs) + return -EINVAL; + + for (i = 0; i < count; i++) { + int fd = fds ? fds[i] : -1; + + ret = do_set_irq_trigger(feature, start + i, fd); + if (ret) { + while (i--) + do_set_irq_trigger(feature, start + i, -1); + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(dfl_fpga_set_irq_triggers); + +/** + * dfl_feature_ioctl_get_num_irqs - dfl feature _GET_IRQ_NUM ioctl interface. + * @pdev: the feature device which has the sub feature + * @feature: the dfl sub feature + * @arg: ioctl argument + * + * Return: 0 on success, negative error code otherwise. + */ +long dfl_feature_ioctl_get_num_irqs(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg) +{ + return put_user(feature->nr_irqs, (__u32 __user *)arg); +} +EXPORT_SYMBOL_GPL(dfl_feature_ioctl_get_num_irqs); + +/** + * dfl_feature_ioctl_set_irq - dfl feature _SET_IRQ ioctl interface. + * @pdev: the feature device which has the sub feature + * @feature: the dfl sub feature + * @arg: ioctl argument + * + * Return: 0 on success, negative error code otherwise. + */ +long dfl_feature_ioctl_set_irq(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg) +{ + struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct dfl_fpga_irq_set hdr; + s32 *fds; + long ret; + + if (!feature->nr_irqs) + return -ENOENT; + + if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr))) + return -EFAULT; + + if (!hdr.count || (hdr.start + hdr.count > feature->nr_irqs) || + (hdr.start + hdr.count < hdr.start)) + return -EINVAL; + + fds = memdup_user((void __user *)(arg + sizeof(hdr)), + hdr.count * sizeof(s32)); + if (IS_ERR(fds)) + return PTR_ERR(fds); + + mutex_lock(&pdata->lock); + ret = dfl_fpga_set_irq_triggers(feature, hdr.start, hdr.count, fds); + mutex_unlock(&pdata->lock); + + kfree(fds); + return ret; +} +EXPORT_SYMBOL_GPL(dfl_feature_ioctl_set_irq); + static void __exit dfl_fpga_exit(void) { dfl_chardev_uinit(); diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h index e1f3ab86560e..a32dfba2a88b 100644 --- a/drivers/fpga/dfl.h +++ b/drivers/fpga/dfl.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -214,14 +215,19 @@ struct dfl_feature_driver { * struct dfl_feature_irq_ctx - dfl private feature interrupt context * * @irq: Linux IRQ number of this interrupt. + * @trigger: eventfd context to signal when interrupt happens. + * @name: irq name needed when requesting irq. */ struct dfl_feature_irq_ctx { int irq; + struct eventfd_ctx *trigger; + char *name; }; /** * struct dfl_feature - sub feature of the feature devices * + * @dev: ptr to pdev of the feature device which has the sub feature. * @id: sub feature id. * @resource_index: each sub feature has one mmio resource for its registers. * this index is used to find its mmio resource from the @@ -233,6 +239,7 @@ struct dfl_feature_irq_ctx { * @priv: priv data of this feature. */ struct dfl_feature { + struct platform_device *dev; u64 id; int resource_index; void __iomem *ioaddr; @@ -503,4 +510,13 @@ int dfl_fpga_cdev_release_port(struct dfl_fpga_cdev *cdev, int port_id); int dfl_fpga_cdev_assign_port(struct dfl_fpga_cdev *cdev, int port_id); void dfl_fpga_cdev_config_ports_pf(struct dfl_fpga_cdev *cdev); int dfl_fpga_cdev_config_ports_vf(struct dfl_fpga_cdev *cdev, int num_vf); +int dfl_fpga_set_irq_triggers(struct dfl_feature *feature, unsigned int start, + unsigned int count, int32_t *fds); +long dfl_feature_ioctl_get_num_irqs(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg); +long dfl_feature_ioctl_set_irq(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg); + #endif /* __FPGA_DFL_H */ diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index ec70a0746e59..7331350f3067 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -151,6 +151,19 @@ struct dfl_fpga_port_dma_unmap { #define DFL_FPGA_PORT_DMA_UNMAP _IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 4) +/** + * struct dfl_fpga_irq_set - the argument for DFL_FPGA_XXX_SET_IRQ ioctl. + * + * @start: Index of the first irq. + * @count: The number of eventfd handler. + * @evtfds: Eventfd handlers. + */ +struct dfl_fpga_irq_set { + __u32 start; + __u32 count; + __s32 evtfds[]; +}; + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3-59-g8ed1b From fe80536acf8397827be77f9b8ada384b90e790d0 Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 28 Jun 2020 23:18:23 +0530 Subject: bareudp: Added attribute to enable & disable rx metadata collection Metadata need not be collected in receive if the packet from bareudp device is not targeted to openvswitch. Signed-off-by: Martin Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++++-- drivers/net/bareudp.c | 23 +++++++++++++++++------ include/net/bareudp.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 465a8b251bfe..0e00636d8d74 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,5 +48,7 @@ enabled. The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device extracts and stores the tunnel information in SKB dst field before -passing the packet buffer to the network stack. +bareudp device decapsulates the udp header and passes the inner packet to the +network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel +information will be stored in the SKB dst field before the packet buffer is +passed to the network stack. diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 3dd46cd55114..108a8cafc4f8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,6 +46,7 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -125,13 +126,14 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; + if (bareudp->rx_collect_metadata) { + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + skb_dst_set(skb, &tun_dst->dst); } - skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -575,6 +577,9 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; + if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) + conf->rx_collect_metadata = true; + return 0; } @@ -612,6 +617,8 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; + bareudp->rx_collect_metadata = conf->rx_collect_metadata; + err = register_netdevice(dev); if (err) return err; @@ -669,6 +676,7 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ + nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -685,6 +693,9 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; + if (bareudp->rx_collect_metadata && + nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) + goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index dc65a0d71d9b..3dd5f9a8d01c 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,6 +12,7 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a009365ad67b..cc185a007ade 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -600,6 +600,7 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, + IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3-59-g8ed1b From aee9caa03fc3c8b02f8f31824354d85f30e562e0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:28 +0300 Subject: net: sched: sch_red: Add qevents "early_drop" and "mark" In order to allow acting on dropped and/or ECN-marked packets, add two new qevents to the RED qdisc: "early_drop" and "mark". Filters attached at "early_drop" block are executed as packets are early-dropped, those attached at the "mark" block are executed as packets are ECN-marked. Two new attributes are introduced: TCA_RED_EARLY_DROP_BLOCK with the block index for the "early_drop" qevent, and TCA_RED_MARK_BLOCK for the "mark" qevent. Absence of these attributes signifies "don't care": no block is allocated in that case, or the existing blocks are left intact in case of the change callback. For purposes of offloading, blocks attached to these qevents appear with newly-introduced binder types, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP and FLOW_BLOCK_BINDER_TYPE_RED_MARK. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/net/flow_offload.h | 2 ++ include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_red.c | 58 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3bafb5124ac0..3e793ac66baf 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -424,6 +424,8 @@ enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a95f3ae7ab37..9e7c2c607845 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -257,6 +257,8 @@ enum { TCA_RED_STAB, TCA_RED_MAX_P, TCA_RED_FLAGS, /* bitfield32 */ + TCA_RED_EARLY_DROP_BLOCK, /* u32 */ + TCA_RED_MARK_BLOCK, /* u32 */ __TCA_RED_MAX, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 225ce370e5a8..de2be4d04ed6 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -46,6 +46,8 @@ struct red_sched_data { struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; + struct tcf_qevent qe_early_drop; + struct tcf_qevent qe_mark; }; #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) @@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.prob_drop++; goto congestion_drop; @@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; @@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ return ret; congestion_drop: + skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + tcf_qevent_destroy(&q->qe_mark, sch); + tcf_qevent_destroy(&q->qe_early_drop, sch); del_timer_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); @@ -213,6 +227,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), + [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 }, + [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 }, }; static int __red_change(struct Qdisc *sch, struct nlattr **tb, @@ -328,12 +344,38 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return __red_change(sch, tb, extack); + + err = __red_change(sch, tb, extack); + if (err) + return err; + + err = tcf_qevent_init(&q->qe_early_drop, sch, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + goto err_early_drop_init; + + err = tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + goto err_mark_init; + + return 0; + +err_mark_init: + tcf_qevent_destroy(&q->qe_early_drop, sch); +err_early_drop_init: + del_timer_sync(&q->adapt_timer); + red_offload(sch, false); + qdisc_put(q->qdisc); + return err; } static int red_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; int err; @@ -345,6 +387,16 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + err = tcf_qevent_validate_change(&q->qe_early_drop, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + return err; + + err = tcf_qevent_validate_change(&q->qe_mark, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + return err; + return __red_change(sch, tb, extack); } @@ -389,7 +441,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, TC_RED_SUPPORTED_FLAGS)) + q->flags, TC_RED_SUPPORTED_FLAGS) || + tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) || + tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3-59-g8ed1b From ecc31c60240b9808a274befc5db6b8a249a6ade1 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 29 Jun 2020 23:46:16 +0300 Subject: ethtool: Add link extended state Currently, drivers can only tell whether the link is up/down using LINKSTATE_GET, but no additional information is given. Add attributes to LINKSTATE_GET command in order to allow drivers to expose the user more information in addition to link state to ease the debug process, for example, reason for link down state. Extended state consists of two attributes - link_ext_state and link_ext_substate. The idea is to avoid 'vendor specific' states in order to prevent drivers to use specific link_ext_state that can be in the future common link_ext_state. The substates allows drivers to add more information to the common link_ext_state. For example, vendor can expose 'Autoneg' as link_ext_state and add 'No partner detected during force mode' as link_ext_substate. If a driver cannot pinpoint the extended state with the substate accuracy, it is free to expose only the extended state and omit the substate attribute. Signed-off-by: Amit Cohen Reviewed-by: Jiri Pirko Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/ethtool.h | 23 ++++++++++++ include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 2 ++ net/ethtool/linkstate.c | 52 ++++++++++++++++++++++++--- 4 files changed, 143 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a23b26eab479..48ad3b6a0150 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -86,6 +86,22 @@ struct net_device; u32 ethtool_op_get_link(struct net_device *dev); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti); + +/** + * struct ethtool_link_ext_state_info - link extended state and substate. + */ +struct ethtool_link_ext_state_info { + enum ethtool_link_ext_state link_ext_state; + union { + enum ethtool_link_ext_substate_autoneg autoneg; + enum ethtool_link_ext_substate_link_training link_training; + enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; + enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; + enum ethtool_link_ext_substate_cable_issue cable_issue; + u8 __link_ext_substate; + }; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -245,6 +261,11 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * @get_link: Report whether physical link is up. Will only be called if * the netdev is up. Should usually be set to ethtool_op_get_link(), * which uses netif_carrier_ok(). + * @get_link_ext_state: Report link extended state. Should set link_ext_state and + * link_ext_substate (link_ext_substate of 0 means link_ext_substate is unknown, + * do not attach ext_substate attribute to netlink message). If link_ext_state + * and link_ext_substate are unknown, return -ENODATA. If not implemented, + * link_ext_state and link_ext_substate will not be sent to userspace. * @get_eeprom: Read data from the device EEPROM. * Should fill in the magic field. Don't need to check len for zero * or wraparound. Fill in the data argument with the eeprom values @@ -384,6 +405,8 @@ struct ethtool_ops { void (*set_msglevel)(struct net_device *, u32); int (*nway_reset)(struct net_device *); u32 (*get_link)(struct net_device *); + int (*get_link_ext_state)(struct net_device *, + struct ethtool_link_ext_state_info *); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f4662b3a9e1e..d1413538ef30 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -579,6 +579,76 @@ struct ethtool_pauseparam { __u32 tx_pause; }; +/** + * enum ethtool_link_ext_state - link extended state + */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, +}; + +/** + * enum ethtool_link_ext_substate_autoneg - more information in addition to + * ETHTOOL_LINK_EXT_STATE_AUTONEG. + */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/** + * enum ethtool_link_ext_substate_link_training - more information in addition to + * ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/** + * enum ethtool_link_ext_substate_logical_mismatch - more information in addition + * to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/** + * enum ethtool_link_ext_substate_bad_signal_integrity - more information in + * addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, +}; + +/** + * enum ethtool_link_ext_substate_cable_issue - more information in + * addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. + */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + #define ETH_GSTRING_LEN 32 /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 4dda5e4244a7..c12ce4df4b6b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -236,6 +236,8 @@ enum { ETHTOOL_A_LINKSTATE_LINK, /* u8 */ ETHTOOL_A_LINKSTATE_SQI, /* u32 */ ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ + ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index afe5ac8a0f00..4834091ec24c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -9,10 +9,12 @@ struct linkstate_req_info { }; struct linkstate_reply_data { - struct ethnl_reply_data base; - int link; - int sqi; - int sqi_max; + struct ethnl_reply_data base; + int link; + int sqi; + int sqi_max; + bool link_ext_state_provided; + struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -25,6 +27,8 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT }, }; static int linkstate_get_sqi(struct net_device *dev) @@ -61,6 +65,23 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_unlock(&phydev->lock); return ret; +}; + +static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) +{ + int err; + + if (!dev->ethtool_ops->get_link_ext_state) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info); + if (err) + return err; + + data->link_ext_state_provided = true; + + return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, @@ -86,6 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; data->sqi_max = ret; + if (dev->flags & IFF_UP) { + ret = linkstate_get_link_ext_state(dev, data); + if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) + goto out; + } + ret = 0; out: ethnl_ops_complete(dev); @@ -107,6 +134,12 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->sqi_max != -EOPNOTSUPP) len += nla_total_size(sizeof(u32)); + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ + + if (data->ethtool_link_ext_state_info.__link_ext_substate) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + return len; } @@ -128,6 +161,17 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, + data->ethtool_link_ext_state_info.link_ext_state)) + return -EMSGSIZE; + + if (data->ethtool_link_ext_state_info.__link_ext_substate && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + data->ethtool_link_ext_state_info.__link_ext_substate)) + return -EMSGSIZE; + } + return 0; } -- cgit v1.2.3-59-g8ed1b From 970471914c67b70df24def6b2a30cc42acbebded Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 16 Jun 2020 16:47:14 +0200 Subject: iommu: Allow page responses without PASID Some PCIe devices do not expect a PASID value in PRI Page Responses. If the "PRG Response PASID Required" bit in the PRI capability is zero, then the OS should not set the PASID field. Similarly on Arm SMMU, responses to stall events do not have a PASID. Currently iommu_page_response() systematically checks that the PASID in the page response corresponds to the one in the page request. This can't work with virtualization because a page response coming from a guest OS won't have a PASID if the passed-through device does not require one. Add a flag to page requests that declares whether the corresponding response needs to have a PASID. When this flag isn't set, allow page responses without PASID. Reported-by: Shameerali Kolothum Thodi Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200616144712.748818-1-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 23 +++++++++++++++++------ include/uapi/linux/iommu.h | 6 +++++- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d43120eb1dc5..1ed1e14a1f0c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1185,11 +1185,12 @@ EXPORT_SYMBOL_GPL(iommu_report_device_fault); int iommu_page_response(struct device *dev, struct iommu_page_response *msg) { - bool pasid_valid; + bool needs_pasid; int ret = -EINVAL; struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; + bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); if (!domain || !domain->ops->page_response) @@ -1214,14 +1215,24 @@ int iommu_page_response(struct device *dev, */ list_for_each_entry(evt, ¶m->fault_param->faults, list) { prm = &evt->fault.prm; - pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + if (prm->grpid != msg->grpid) + continue; - if ((pasid_valid && prm->pasid != msg->pasid) || - prm->grpid != msg->grpid) + /* + * If the PASID is required, the corresponding request is + * matched using the group ID, the PASID valid bit and the PASID + * value. Otherwise only the group ID matches request and + * response. + */ + needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) continue; - /* Sanitize the reply */ - msg->flags = pasid_valid ? IOMMU_PAGE_RESP_PASID_VALID : 0; + if (!needs_pasid && has_pasid) { + /* No big deal, just clear it. */ + msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg->pasid = 0; + } ret = domain->ops->page_response(dev, evt, msg); list_del(&evt->list); diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index e907b7091a46..c2b2caf9ed41 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -81,7 +81,10 @@ struct iommu_fault_unrecoverable { /** * struct iommu_fault_page_request - Page Request data * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values) + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) @@ -92,6 +95,7 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) __u32 flags; __u32 pasid; __u32 grpid; -- cgit v1.2.3-59-g8ed1b From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps stack trace of given task. This is different to bpf_get_stack(), which gets stack track of current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc//stack to a seq_file. bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to unsigned long array. For 32-bit systems, we need to translate it to u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 37 +++++++++++++++++++- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d2ade703a35..0cd7f6884c5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6bab40ff442e..6843376733df 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -426,6 +426,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', 'struct __sk_buff', 'struct sk_msg_md', @@ -468,6 +469,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 62e9dd177732843ae6c5b9d2ed61e7c9538fa276 Mon Sep 17 00:00:00 2001 From: Shyam Sundar Date: Tue, 30 Jun 2020 03:22:28 -0700 Subject: scsi: qla2xxx: Change in PUREX to handle FPIN ELS requests SAN Congestion Management generates ELS pkts whose size can vary and be > 64 bytes. Change the PUREX handling code to support non-standard ELS pkt size. Link: https://lore.kernel.org/r/20200630102229.29660-2-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Shyam Sundar Signed-off-by: Arun Easi Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_def.h | 15 +++++- drivers/scsi/qla2xxx/qla_gbl.h | 3 +- drivers/scsi/qla2xxx/qla_isr.c | 116 ++++++++++++++++++++++++++++++----------- drivers/scsi/qla2xxx/qla_mbx.c | 22 ++++++-- drivers/scsi/qla2xxx/qla_os.c | 19 +++++-- include/uapi/scsi/fc/fc_els.h | 2 + 6 files changed, 134 insertions(+), 43 deletions(-) (limited to 'include/uapi') diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 42dbf90d4651..9a0f2314fe7b 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -34,6 +34,8 @@ #include #include +#include + /* Big endian Fibre Channel S_ID (source ID) or D_ID (destination ID). */ typedef struct { uint8_t domain; @@ -1304,7 +1306,6 @@ static inline bool qla2xxx_is_valid_mbs(unsigned int mbs) #define RNID_TYPE_ASIC_TEMP 0xC #define ELS_CMD_MAP_SIZE 32 -#define ELS_COMMAND_RDP 0x18 /* * Firmware state codes from get firmware state mailbox command @@ -4522,10 +4523,19 @@ struct active_regions { #define QLA_SET_DATA_RATE_NOLR 1 #define QLA_SET_DATA_RATE_LR 2 /* Set speed and initiate LR */ +#define QLA_DEFAULT_PAYLOAD_SIZE 64 +/* + * This item might be allocated with a size > sizeof(struct purex_item). + * The "size" variable gives the size of the payload (which + * is variable) starting at "iocb". + */ struct purex_item { struct list_head list; struct scsi_qla_host *vha; - void (*process_item)(struct scsi_qla_host *vha, void *pkt); + void (*process_item)(struct scsi_qla_host *vha, + struct purex_item *pkt); + atomic_t in_use; + uint16_t size; struct { uint8_t iocb[64]; } iocb; @@ -4725,6 +4735,7 @@ typedef struct scsi_qla_host { struct list_head head; spinlock_t lock; } purex_list; + struct purex_item default_item; struct name_list_extended gnl; /* Count of active session/fcport */ diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index 061f91b521b3..9cf33d05e3f8 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -229,7 +229,8 @@ void qla2x00_handle_login_done_event(struct scsi_qla_host *, fc_port_t *, int qla24xx_post_gnl_work(struct scsi_qla_host *, fc_port_t *); int qla24xx_post_relogin_work(struct scsi_qla_host *vha); void qla2x00_wait_for_sess_deletion(scsi_qla_host_t *); -void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, void *pkt); +void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, + struct purex_item *pkt); /* * Global Functions in qla_mid.c source file. diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index cf0800546740..3bbfff20e3a6 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -31,35 +31,11 @@ const char *const port_state_str[] = { "ONLINE" }; -static void qla24xx_purex_iocb(scsi_qla_host_t *vha, void *pkt, - void (*process_item)(struct scsi_qla_host *vha, void *pkt)) -{ - struct purex_list *list = &vha->purex_list; - struct purex_item *item; - ulong flags; - - item = kzalloc(sizeof(*item), GFP_KERNEL); - if (!item) { - ql_log(ql_log_warn, vha, 0x5092, - ">> Failed allocate purex list item.\n"); - return; - } - - item->vha = vha; - item->process_item = process_item; - memcpy(&item->iocb, pkt, sizeof(item->iocb)); - - spin_lock_irqsave(&list->lock, flags); - list_add_tail(&item->list, &list->head); - spin_unlock_irqrestore(&list->lock, flags); - - set_bit(PROCESS_PUREX_IOCB, &vha->dpc_flags); -} - static void -qla24xx_process_abts(struct scsi_qla_host *vha, void *pkt) +qla24xx_process_abts(struct scsi_qla_host *vha, struct purex_item *pkt) { - struct abts_entry_24xx *abts = pkt; + struct abts_entry_24xx *abts = + (struct abts_entry_24xx *)&pkt->iocb; struct qla_hw_data *ha = vha->hw; struct els_entry_24xx *rsp_els; struct abts_entry_24xx *abts_rsp; @@ -789,6 +765,74 @@ qla27xx_handle_8200_aen(scsi_qla_host_t *vha, uint16_t *mb) } } +struct purex_item * +qla24xx_alloc_purex_item(scsi_qla_host_t *vha, uint16_t size) +{ + struct purex_item *item = NULL; + uint8_t item_hdr_size = sizeof(*item); + + if (size > QLA_DEFAULT_PAYLOAD_SIZE) { + item = kzalloc(item_hdr_size + + (size - QLA_DEFAULT_PAYLOAD_SIZE), GFP_ATOMIC); + } else { + if (atomic_inc_return(&vha->default_item.in_use) == 1) { + item = &vha->default_item; + goto initialize_purex_header; + } else { + item = kzalloc(item_hdr_size, GFP_ATOMIC); + } + } + if (!item) { + ql_log(ql_log_warn, vha, 0x5092, + ">> Failed allocate purex list item.\n"); + + return NULL; + } + +initialize_purex_header: + item->vha = vha; + item->size = size; + return item; +} + +static void +qla24xx_queue_purex_item(scsi_qla_host_t *vha, struct purex_item *pkt, + void (*process_item)(struct scsi_qla_host *vha, + struct purex_item *pkt)) +{ + struct purex_list *list = &vha->purex_list; + ulong flags; + + pkt->process_item = process_item; + + spin_lock_irqsave(&list->lock, flags); + list_add_tail(&pkt->list, &list->head); + spin_unlock_irqrestore(&list->lock, flags); + + set_bit(PROCESS_PUREX_IOCB, &vha->dpc_flags); +} + +/** + * qla24xx_copy_std_pkt() - Copy over purex ELS which is + * contained in a single IOCB. + * purex packet. + * @vha: SCSI driver HA context + * @pkt: ELS packet + */ +struct purex_item +*qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) +{ + struct purex_item *item; + + item = qla24xx_alloc_purex_item(vha, + QLA_DEFAULT_PAYLOAD_SIZE); + if (!item) + return item; + + memcpy(&item->iocb, pkt, sizeof(item->iocb)); + return item; +} + /** * qla2x00_async_event() - Process aynchronous events. * @vha: SCSI driver HA context @@ -3229,6 +3273,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, { struct sts_entry_24xx *pkt; struct qla_hw_data *ha = vha->hw; + struct purex_item *pure_item; if (!ha->flags.fw_started) return; @@ -3280,8 +3325,12 @@ process_err: break; case ABTS_RECV_24XX: if (qla_ini_mode_enabled(vha)) { - qla24xx_purex_iocb(vha, pkt, - qla24xx_process_abts); + pure_item = qla24xx_copy_std_pkt(vha, pkt); + if (!pure_item) + break; + + qla24xx_queue_purex_item(vha, pure_item, + qla24xx_process_abts); break; } if (IS_QLA83XX(ha) || IS_QLA27XX(ha) || @@ -3332,13 +3381,18 @@ process_err: { struct purex_entry_24xx *purex = (void *)pkt; - if (purex->els_frame_payload[3] != ELS_COMMAND_RDP) { + if (purex->els_frame_payload[3] != ELS_RDP) { ql_dbg(ql_dbg_init, vha, 0x5091, "Discarding ELS Request opcode %#x...\n", purex->els_frame_payload[3]); break; } - qla24xx_purex_iocb(vha, pkt, qla24xx_process_purex_rdp); + pure_item = qla24xx_copy_std_pkt(vha, pkt); + if (!pure_item) + break; + + qla24xx_queue_purex_item(vha, pure_item, + qla24xx_process_purex_rdp); break; } default: diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index df31ee0d59b2..b29d3831ea73 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -59,6 +59,7 @@ static struct rom_cmd { { MBC_IOCB_COMMAND_A64 }, { MBC_GET_ADAPTER_LOOP_ID }, { MBC_READ_SFP }, + { MBC_SET_RNID_PARAMS }, { MBC_GET_RNID_PARAMS }, { MBC_GET_SET_ZIO_THRESHOLD }, }; @@ -4866,6 +4867,7 @@ qla24xx_get_port_login_templ(scsi_qla_host_t *vha, dma_addr_t buf_dma, return rval; } +#define PUREX_CMD_COUNT 2 int qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) { @@ -4874,12 +4876,12 @@ qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) mbx_cmd_t *mcp = &mc; uint8_t *els_cmd_map; dma_addr_t els_cmd_map_dma; - uint cmd_opcode = ELS_COMMAND_RDP; - uint index = cmd_opcode / 8; - uint bit = cmd_opcode % 8; + uint8_t cmd_opcode[PUREX_CMD_COUNT]; + uint8_t i, index, purex_bit; struct qla_hw_data *ha = vha->hw; - if (!IS_QLA25XX(ha) && !IS_QLA2031(ha) && !IS_QLA27XX(ha)) + if (!IS_QLA25XX(ha) && !IS_QLA2031(ha) && + !IS_QLA27XX(ha) && !IS_QLA28XX(ha)) return QLA_SUCCESS; ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x1197, @@ -4893,7 +4895,17 @@ qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) return QLA_MEMORY_ALLOC_FAILED; } - els_cmd_map[index] |= 1 << bit; + memset(els_cmd_map, 0, ELS_CMD_MAP_SIZE); + + /* List of Purex ELS */ + cmd_opcode[0] = ELS_FPIN; + cmd_opcode[1] = ELS_RDP; + + for (i = 0; i < PUREX_CMD_COUNT; i++) { + index = cmd_opcode[i] / 8; + purex_bit = cmd_opcode[i] % 8; + els_cmd_map[index] |= 1 << purex_bit; + } mcp->mb[0] = MBC_SET_RNID_PARAMS; mcp->mb[1] = RNID_TYPE_ELS_CMD << 8; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index e92fad99338c..80ce22cfc1b9 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5893,10 +5893,12 @@ qla25xx_rdp_port_speed_currently(struct qla_hw_data *ha) * vha: SCSI qla host * purex: RDP request received by HBA */ -void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, void *pkt) +void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, + struct purex_item *item) { struct qla_hw_data *ha = vha->hw; - struct purex_entry_24xx *purex = pkt; + struct purex_entry_24xx *purex = + (struct purex_entry_24xx *)&item->iocb; dma_addr_t rsp_els_dma; dma_addr_t rsp_payload_dma; dma_addr_t stat_dma; @@ -6306,6 +6308,15 @@ dealloc: rsp_els, rsp_els_dma); } +void +qla24xx_free_purex_item(struct purex_item *item) +{ + if (item == &item->vha->default_item) + memset(&item->vha->default_item, 0, sizeof(struct purex_item)); + else + kfree(item); +} + void qla24xx_process_purex_list(struct purex_list *list) { struct list_head head = LIST_HEAD_INIT(head); @@ -6318,8 +6329,8 @@ void qla24xx_process_purex_list(struct purex_list *list) list_for_each_entry_safe(item, next, &head, list) { list_del(&item->list); - item->process_item(item->vha, &item->iocb); - kfree(item); + item->process_item(item->vha, item); + qla24xx_free_purex_item(item); } } diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index 66318c44acd7..8c704e510e39 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -41,6 +41,7 @@ enum fc_els_cmd { ELS_REC = 0x13, /* read exchange concise */ ELS_SRR = 0x14, /* sequence retransmission request */ ELS_FPIN = 0x16, /* Fabric Performance Impact Notification */ + ELS_RDP = 0x18, /* Read Diagnostic Parameters */ ELS_RDF = 0x19, /* Register Diagnostic Functions */ ELS_PRLI = 0x20, /* process login */ ELS_PRLO = 0x21, /* process logout */ @@ -110,6 +111,7 @@ enum fc_els_cmd { [ELS_REC] = "REC", \ [ELS_SRR] = "SRR", \ [ELS_FPIN] = "FPIN", \ + [ELS_RDP] = "RDP", \ [ELS_RDF] = "RDF", \ [ELS_PRLI] = "PRLI", \ [ELS_PRLO] = "PRLO", \ -- cgit v1.2.3-59-g8ed1b From e4266b991fead8eb996688e82ff39f6cc59ef7dd Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:05 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes to get the status Add MRP attribute IFLA_BRIDGE_MRP_INFO to allow the userspace to get the current state of the MRP instances. This is a nested attribute that contains other attributes like, ring id, index of primary and secondary port, priority, ring state, ring role. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index caa6914a3e53..c114c1c2bd53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -166,6 +166,7 @@ enum { IFLA_BRIDGE_MRP_RING_STATE, IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, + IFLA_BRIDGE_MRP_INFO, __IFLA_BRIDGE_MRP_MAX, }; @@ -228,6 +229,22 @@ enum { #define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) +enum { + IFLA_BRIDGE_MRP_INFO_UNSPEC, + IFLA_BRIDGE_MRP_INFO_RING_ID, + IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + IFLA_BRIDGE_MRP_INFO_PRIO, + IFLA_BRIDGE_MRP_INFO_RING_STATE, + IFLA_BRIDGE_MRP_INFO_RING_ROLE, + IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + __IFLA_BRIDGE_MRP_INFO_MAX, +}; + +#define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; -- cgit v1.2.3-59-g8ed1b From 36a8e8e26542056bbd7eb5e047cadee30587d230 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:07 +0200 Subject: bridge: Extend br_fill_ifinfo to return MPR status This patch extends the function br_fill_ifinfo to return also the MRP status for each instance on a bridge. It also adds a new filter RTEXT_FILTER_MRP to return the MRP status only when this is set, not to interfer with the vlans. The MRP status is return only on the bridge interfaces. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/bridge/br_netlink.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 879e64950a0a..9b814c92de12 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -778,6 +778,7 @@ enum { #define RTEXT_FILTER_BRVLAN (1 << 1) #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) #define RTEXT_FILTER_SKIP_STATS (1 << 3) +#define RTEXT_FILTER_MRP (1 << 4) /* End of information exported to user level */ diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 240e260e3461..c532fa65c983 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -453,6 +453,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); if (err) goto nla_put_failure; + + nla_nest_end(skb, af); + } + + if (filter_mask & RTEXT_FILTER_MRP) { + struct nlattr *af; + int err; + + if (!br_mrp_enabled(br) || port) + goto done; + + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + rcu_read_lock(); + err = br_mrp_fill_info(skb, br); + rcu_read_unlock(); + + if (err) + goto nla_put_failure; + nla_nest_end(skb, af); } @@ -516,7 +538,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && - !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && + !(filter_mask & RTEXT_FILTER_MRP)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, -- cgit v1.2.3-59-g8ed1b From 74cccc3d38438b346e40a4f8133cff3f0839ff84 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:11 +0200 Subject: netfilter: nf_tables: add NFTA_CHAIN_ID attribute This netlink attribute allows you to refer to chains inside a transaction as an alternative to the name and the handle. The chain binding support requires this new chain ID approach. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6f0f6fca9ac3..3e5226684017 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1433,6 +1433,7 @@ struct nft_trans_chain { char *name; struct nft_stats __percpu *stats; u8 policy; + u32 chain_id; }; #define nft_trans_chain_update(trans) \ @@ -1443,6 +1444,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_id(trans) \ + (((struct nft_trans_chain *)trans->data)->chain_id) struct nft_trans_table { bool update; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4565456c0ef4..477779595b78 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -196,6 +196,7 @@ enum nft_table_attributes { * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING) * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags + * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -209,6 +210,7 @@ enum nft_chain_attributes { NFTA_CHAIN_COUNTERS, NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, + NFTA_CHAIN_ID, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7647ecfa0d40..650ef0dd0773 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -280,9 +280,15 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return trans; } @@ -1274,6 +1280,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -2154,9 +2161,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_chain *chain = NULL; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; @@ -2181,7 +2188,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { @@ -2190,6 +2197,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { -- cgit v1.2.3-59-g8ed1b From 837830a4b439bfeb86c70b0115c280377c84714b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:16 +0200 Subject: netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute This new netlink attribute allows you to add rules to chains by the chain ID. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 36 ++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 477779595b78..2304d1b7ba5e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -240,6 +240,7 @@ enum nft_rule_attributes { NFTA_RULE_PAD, NFTA_RULE_ID, NFTA_RULE_POSITION_ID, + NFTA_RULE_CHAIN_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 650ef0dd0773..fbe8f9209813 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2153,6 +2153,22 @@ err: return err; } +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWCHAIN && + id == nft_trans_chain_id(trans)) + return chain; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2633,6 +2649,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -3039,10 +3056,21 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } if (nla[NFTA_RULE_HANDLE]) { -- cgit v1.2.3-59-g8ed1b From 51d70f181ff4e2c996ddf256af1efecd7d5864e5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:21 +0200 Subject: netfilter: nf_tables: add NFTA_VERDICT_CHAIN_ID attribute This netlink attribute allows you to identify the chain to jump/goto by means of the chain ID. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2304d1b7ba5e..683e75126d68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -471,11 +471,13 @@ enum nft_data_attributes { * * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts) * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING) + * @NFTA_VERDICT_CHAIN_ID: jump target chain ID (NLA_U32) */ enum nft_verdict_attributes { NFTA_VERDICT_UNSPEC, NFTA_VERDICT_CODE, NFTA_VERDICT_CHAIN, + NFTA_VERDICT_CHAIN_ID, __NFTA_VERDICT_MAX }; #define NFTA_VERDICT_MAX (__NFTA_VERDICT_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fbe8f9209813..d86602797a69 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8242,6 +8242,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, @@ -8278,10 +8279,19 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, + tb[NFTA_VERDICT_CHAIN_ID]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) -- cgit v1.2.3-59-g8ed1b From 67c49de4ad862c567088c5119cf125e566f56e7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:25 +0200 Subject: netfilter: nf_tables: expose enum nft_chain_flags through UAPI This enum definition was never exposed through UAPI. Rename NFT_BASE_CHAIN to NFT_CHAIN_BASE for consistency. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +------ include/uapi/linux/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3e5226684017..6d1e7da6e00a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -921,11 +921,6 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, (expr) != (last); \ (expr) = nft_expr_next(expr)) -enum nft_chain_flags { - NFT_BASE_CHAIN = 0x1, - NFT_CHAIN_HW_OFFLOAD = 0x2, -}; - #define NFT_CHAIN_POLICY_UNSET U8_MAX /** @@ -1036,7 +1031,7 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai static inline bool nft_is_base_chain(const struct nft_chain *chain) { - return chain->flags & NFT_BASE_CHAIN; + return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683e75126d68..2cf7cc3b50c1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -184,6 +184,11 @@ enum nft_table_attributes { }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) +enum nft_chain_flags { + NFT_CHAIN_BASE = (1 << 0), + NFT_CHAIN_HW_OFFLOAD = (1 << 1), +}; + /** * enum nft_chain_attributes - nf_tables chain netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d86602797a69..b7582a1c8dce 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1903,7 +1903,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, nft_basechain_hook_init(&basechain->ops, family, hook, chain); } - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && nft_chain_offload_priority(basechain) < 0) @@ -2255,7 +2255,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; + flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags); } -- cgit v1.2.3-59-g8ed1b From d0e2c7de92c7f2b3d355ad76b0bb9fc43d1beb87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:36 +0200 Subject: netfilter: nf_tables: add NFT_CHAIN_BINDING This new chain flag specifies that: * the kernel dynamically allocates the chain name, if no chain name is specified. * If the immediate expression that refers to this chain is removed, then this bound chain (and its content) is destroyed. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++++- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 86 +++++++++++++++++++++++++++----- net/netfilter/nft_immediate.c | 51 +++++++++++++++++++ 4 files changed, 138 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6d1e7da6e00a..822c26766330 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -899,6 +899,8 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); + static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -944,7 +946,8 @@ struct nft_chain { struct nft_table *table; u64 handle; u32 use; - u8 flags:6, + u8 flags:5, + bound:1, genmask:2; char *name; @@ -989,6 +992,14 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_is_bound(struct nft_chain *chain) +{ + return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; +} + +void nft_chain_del(struct nft_chain *chain); +void nf_tables_chain_destroy(struct nft_ctx *ctx); + struct nft_stats { u64 bytes; u64 pkts; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2cf7cc3b50c1..e00b4ae6174e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -187,6 +187,7 @@ enum nft_table_attributes { enum nft_chain_flags { NFT_CHAIN_BASE = (1 << 0), NFT_CHAIN_HW_OFFLOAD = (1 << 1), + NFT_CHAIN_BINDING = (1 << 2), }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7cb9c07802b..b8a970dad213 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1056,6 +1056,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1098,6 +1101,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1413,13 +1419,12 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; @@ -1621,7 +1626,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; struct nft_hook *hook, *next; @@ -1928,6 +1933,8 @@ static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) return 0; } +static u64 chain_id; + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1936,6 +1943,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; struct nft_chain *chain; struct nft_rule **rules; @@ -1947,6 +1955,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1976,16 +1987,33 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return err; } } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + } else { + if (!(flags & NFT_CHAIN_BINDING)) + return -EINVAL; + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL); + } + if (!chain->name) { err = -ENOMEM; goto err1; @@ -2976,8 +3004,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3075,6 +3102,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + } else if (nla[NFTA_RULE_CHAIN_ID]) { chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); if (IS_ERR(chain)) { @@ -3294,6 +3324,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -5330,11 +5362,24 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + struct nft_rule *rule; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + chain->use++; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use++; + list_for_each_entry(rule, &chain->rules, list) + chain->use++; + + nft_chain_add(chain->table, chain); break; } } @@ -7474,7 +7519,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7825,6 +7870,10 @@ static int __nf_tables_abort(struct net *net, bool autoload) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { + if (nft_chain_is_bound(trans->ctx.chain)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, @@ -8321,10 +8370,23 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + struct nft_rule *rule; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + chain->use--; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use--; + list_for_each_entry(rule, &chain->rules, list) + chain->use--; + + nft_chain_del(chain); break; } } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..9e556638bb32 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -54,6 +54,23 @@ static int nft_immediate_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain *chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + if (nft_chain_is_bound(chain)) { + err = -EBUSY; + goto err1; + } + chain->bound = true; + break; + default: + break; + } + } + return 0; err1: @@ -81,6 +98,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_is_bound(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nf_tables_rule_release(&chain_ctx, rule); + + nf_tables_chain_destroy(&chain_ctx); + break; + default: + break; + } +} + static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -170,6 +220,7 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, -- cgit v1.2.3-59-g8ed1b From c1f79a2eefdcc0aef5d7a911c27a3f75f1936ecd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Jul 2020 02:51:28 +0200 Subject: netfilter: nf_tables: reject unsupported chain flags Bail out if userspace sends unsupported chain flags. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e00b4ae6174e..42f351c1f5c5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -189,6 +189,9 @@ enum nft_chain_flags { NFT_CHAIN_HW_OFFLOAD = (1 << 1), NFT_CHAIN_BINDING = (1 << 2), }; +#define NFT_CHAIN_FLAGS (NFT_CHAIN_BASE | \ + NFT_CHAIN_HW_OFFLOAD | \ + NFT_CHAIN_BINDING) /** * enum nft_chain_attributes - nf_tables chain netlink attributes diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b8a970dad213..f96785586f64 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2285,6 +2285,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { -- cgit v1.2.3-59-g8ed1b From ea1be1e59b19017e61aa357d524b743ba5602d3c Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 7 May 2020 20:03:35 +0800 Subject: serial: Remove duplicated macro definition of port type There exists the same macro definition of port type from 0 to 13 in include/uapi/linux/serial.h, remove these duplicated code in include/uapi/linux/serial_core.h which includes the former header. Acked-by: Jiri Slaby Signed-off-by: Tiezhu Yang Link: https://lore.kernel.org/r/1588853015-28392-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 8ec3dd742ea4..851b982f8c4b 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -26,20 +26,6 @@ /* * The type definitions. These are from Ted Ts'o's serial.h */ -#define PORT_UNKNOWN 0 -#define PORT_8250 1 -#define PORT_16450 2 -#define PORT_16550 3 -#define PORT_16550A 4 -#define PORT_CIRRUS 5 -#define PORT_16650 6 -#define PORT_16650V2 7 -#define PORT_16750 8 -#define PORT_STARTECH 9 -#define PORT_16C950 10 -#define PORT_16654 11 -#define PORT_16850 12 -#define PORT_RSA 13 #define PORT_NS16550A 14 #define PORT_XSCALE 15 #define PORT_RM9000 16 /* PMC-Sierra RM9xxx internal UART */ -- cgit v1.2.3-59-g8ed1b From 1c8fb1ea5a1dbd2159e78fa580aaffb001794cfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:12 +0300 Subject: IB/uverbs: Expose UAPI to query ucontext Expose UAPI to query ucontext, this will let user space application that didn't allocate the ucontext but has access to by owning the matching command FD to retrieve the ucontext information. Link: https://lore.kernel.org/r/20200630093916.332097-4-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_std_types_device.c | 41 ++++++++++++++++++++++- include/rdma/ib_verbs.h | 4 +++ include/uapi/rdma/ib_user_ioctl_cmds.h | 6 ++++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 40cf07129f66..1900c0df3c8a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2674,6 +2674,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); + SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index ae4a59d6f9b1..8e58605a17be 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -229,6 +229,37 @@ static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( return 0; } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 num_comp; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_ucontext) + return -EOPNOTSUPP; + + num_comp = attrs->ufile->device->num_comp_vectors; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + return ucontext->device->ops.query_ucontext(ucontext, attrs); +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_GET_CONTEXT, UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, @@ -237,6 +268,13 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), UVERBS_ATTR_UHW()); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_INFO_HANDLES, /* Also includes any device specific object ids */ @@ -260,7 +298,8 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), - &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT)); const struct uapi_definition uverbs_def_obj_device[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 20c801730fed..6c72bb194148 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2645,6 +2645,10 @@ struct ib_device_ops { */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); + /* query driver for its ucontext properties */ + int (*query_ucontext)(struct ib_ucontext *context, + struct uverbs_attr_bundle *attrs); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_pd); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 4961d5e858eb..83b6e71ea216 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -69,6 +69,7 @@ enum uverbs_methods_device { UVERBS_METHOD_INFO_HANDLES, UVERBS_METHOD_QUERY_PORT, UVERBS_METHOD_GET_CONTEXT, + UVERBS_METHOD_QUERY_CONTEXT, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -87,6 +88,11 @@ enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, }; +enum uverbs_attrs_query_context_attr_ids { + UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, +}; + enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_ATTR_CREATE_CQ_CQE, -- cgit v1.2.3-59-g8ed1b From 0fb556b2b58d6b8336c94379042bad2a8512af1a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:14 +0300 Subject: RDMA/mlx5: Implement the query ucontext functionality Implement the query ucontext functionality by returning the original ucontext data as part of an extra mlx5 attribute that holds the driver UAPI response. Link: https://lore.kernel.org/r/20200630093916.332097-6-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 1 + drivers/infiniband/hw/mlx5/main.c | 35 ++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 4 ++++ 3 files changed, 40 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 2d882c02387c..ef04a261097f 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -790,6 +790,7 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, } return uverbs_copy_to(bundle, idx, from, size); } +EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); /* Once called an abort will call through to the type's destroy_hw() */ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 56039fa6c530..dc77388464c9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1990,6 +1990,29 @@ out_ctx: return err; } +static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_alloc_ucontext_resp uctx_resp = {}; + int ret; + + ret = set_ucontext_resp(ibcontext, &uctx_resp); + if (ret) + return ret; + + uctx_resp.response_length = + min_t(size_t, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX), + sizeof(uctx_resp)); + + ret = uverbs_copy_to_struct_or_zero(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + &uctx_resp, + sizeof(uctx_resp)); + return ret; +} + static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); @@ -6364,6 +6387,16 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, enum mlx5_ib_uapi_flow_action_flags)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_query_context, + UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp, + dump_fill_mkey), + UA_MANDATORY)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -6372,6 +6405,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_action), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -6605,6 +6639,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_pkey = mlx5_ib_query_pkey, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, + .query_ucontext = mlx5_ib_query_ucontext, .read_counters = mlx5_ib_read_counters, .reg_user_mr = mlx5_ib_reg_user_mr, .req_notify_cq = mlx5_ib_arm_cq, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 8e316ef896b5..496309e8a856 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -228,6 +228,10 @@ enum mlx5_ib_flow_matcher_methods { MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, }; +enum mlx5_ib_device_query_context_attrs { + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), +}; + #define MLX5_IB_DW_MATCH_PARAM 0x80 struct mlx5_ib_match_params { -- cgit v1.2.3-59-g8ed1b From 05f71ef9797454b9384c740b8acd0d02eca1d1bc Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:15 +0300 Subject: RDMA/mlx5: Introduce UAPI to query PD attributes Introduce UAPI to query PD attributes, this can be used to retrieve PD attributes by having the PD handle of the created one and owning the command FD for the ucontxet. Link: https://lore.kernel.org/r/20200630093916.332097-7-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/Makefile | 3 ++- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 ++ drivers/infiniband/hw/mlx5/std_types.c | 45 ++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 +++++++ 5 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/mlx5/std_types.c (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 8cca61c671f8..9838719aacb9 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -23,4 +23,5 @@ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ flow.o \ - qos.o + qos.o \ + std_types.o diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index dc77388464c9..324a98c77ad5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6401,6 +6401,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), UAPI_DEF_CHAIN(mlx5_ib_qos_defs), + UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_action), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 74d3507d6f80..27b069ef63d9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1384,6 +1384,8 @@ int mlx5_ib_fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr); extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; extern const struct uapi_definition mlx5_ib_qos_defs[]; +extern const struct uapi_definition mlx5_ib_std_types_defs[]; + #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c new file mode 100644 index 000000000000..16145fda68d0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_PD_HANDLE); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + &mpd->pdn, sizeof(mpd->pdn)); +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_PD_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_pd, + UVERBS_OBJECT_PD, + &UVERBS_METHOD(MLX5_IB_METHOD_PD_QUERY)); + +const struct uapi_definition mlx5_ib_std_types_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE( + UVERBS_OBJECT_PD, + &mlx5_ib_pd), + {}, +}; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 496309e8a856..b330e6eee626 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -290,4 +290,14 @@ enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, }; +enum mlx5_ib_query_pd_attrs { + MLX5_IB_ATTR_QUERY_PD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_QUERY_PD_RESP_PDN, +}; + +enum mlx5_ib_pd_methods { + MLX5_IB_METHOD_PD_QUERY = (1U << UVERBS_ID_NS_SHIFT), + +}; + #endif -- cgit v1.2.3-59-g8ed1b From 6c01e6b218aea09ec9947cbf88a4db97b4dd155c Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:16 +0300 Subject: IB/uverbs: Expose UAPI to query MR Expose UAPI to query MR, this will let user space application that didn't allocate the MR but has access to by owning the matching command FD to retrieve its information. Link: https://lore.kernel.org/r/20200630093916.332097-8-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types_mr.c | 52 ++++++++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_cmds.h | 9 +++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index a2722ef8496e..62f58ad56afd 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -148,6 +148,36 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *mr = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE); + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length, sizeof(mr->length)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA, + &mr->iova, sizeof(mr->iova)); + + return IS_UVERBS_COPY_ERR(ret) ? ret : 0; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_ADVISE_MR, UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, @@ -165,6 +195,25 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY, UA_ALLOC_AND_COPY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_DM_MR_REG, UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, @@ -206,7 +255,8 @@ DECLARE_UVERBS_NAMED_OBJECT( UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR)); + &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 83b6e71ea216..99dcabf61a71 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -248,6 +248,7 @@ enum uverbs_methods_mr { UVERBS_METHOD_DM_MR_REG, UVERBS_METHOD_MR_DESTROY, UVERBS_METHOD_ADVISE_MR, + UVERBS_METHOD_QUERY_MR, }; enum uverbs_attrs_mr_destroy_ids { @@ -261,6 +262,14 @@ enum uverbs_attrs_advise_mr_cmd_attr_ids { UVERBS_ATTR_ADVISE_MR_SGE_LIST, }; +enum uverbs_attrs_query_mr_cmd_attr_ids { + UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_QUERY_MR_RESP_IOVA, +}; + enum uverbs_attrs_create_counters_cmd_attr_ids { UVERBS_ATTR_CREATE_COUNTERS_HANDLE, }; -- cgit v1.2.3-59-g8ed1b From fe6a3d6521227e49857d0d6caabbdae3bd4aef89 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:45 +0800 Subject: fpga: dfl: afu: add interrupt support for port error reporting Error reporting interrupt is very useful to notify users that some errors are detected by the hardware. Once users are notified, they could query hardware logged error states, no need to continuously poll on these states. This patch adds interrupt support for port error reporting sub feature. It follows the common DFL interrupt notification and handling mechanism, implements two ioctl commands below for user to query number of irqs supported, and set/unset interrupt triggers. Ioctls: * DFL_FPGA_PORT_ERR_GET_IRQ_NUM get the number of irqs, which is used to determine whether/how many interrupts error reporting feature supports. * DFL_FPGA_PORT_ERR_SET_IRQ set/unset given eventfds as error interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-afu-error.c | 17 +++++++++++++++++ drivers/fpga/dfl-afu-main.c | 4 ++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 3 files changed, 44 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-afu-error.c b/drivers/fpga/dfl-afu-error.c index c1467ae1a6b6..c4691187cca9 100644 --- a/drivers/fpga/dfl-afu-error.c +++ b/drivers/fpga/dfl-afu-error.c @@ -14,6 +14,7 @@ * Mitchel Henry */ +#include #include #include "dfl-afu.h" @@ -219,6 +220,21 @@ static void port_err_uinit(struct platform_device *pdev, afu_port_err_mask(&pdev->dev, true); } +static long +port_err_ioctl(struct platform_device *pdev, struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_PORT_ERR_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_PORT_ERR_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + const struct dfl_feature_id port_err_id_table[] = { {.id = PORT_FEATURE_ID_ERROR,}, {0,} @@ -227,4 +243,5 @@ const struct dfl_feature_id port_err_id_table[] = { const struct dfl_feature_ops port_err_ops = { .init = port_err_init, .uinit = port_err_uinit, + .ioctl = port_err_ioctl, }; diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c index b0c31789a909..357cd5d9b267 100644 --- a/drivers/fpga/dfl-afu-main.c +++ b/drivers/fpga/dfl-afu-main.c @@ -577,6 +577,7 @@ static int afu_release(struct inode *inode, struct file *filp) { struct platform_device *pdev = filp->private_data; struct dfl_feature_platform_data *pdata; + struct dfl_feature *feature; dev_dbg(&pdev->dev, "Device File Release\n"); @@ -586,6 +587,9 @@ static int afu_release(struct inode *inode, struct file *filp) dfl_feature_dev_use_end(pdata); if (!dfl_feature_dev_use_count(pdata)) { + dfl_fpga_dev_for_each_feature(pdata, feature) + dfl_fpga_set_irq_triggers(feature, 0, + feature->nr_irqs, NULL); __port_reset(pdev); afu_dma_region_destroy(pdata); } diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index 7331350f3067..6c71c9d7e926 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -164,6 +164,29 @@ struct dfl_fpga_irq_set { __s32 evtfds[]; }; +/** + * DFL_FPGA_PORT_ERR_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 5, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga port error reporting private + * feature. Currently hardware supports up to 1 irq. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_ERR_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 5, __u32) + +/** + * DFL_FPGA_PORT_ERR_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_PORT_BASE + 6, + * struct dfl_fpga_irq_set) + * + * Set fpga port error reporting interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_ERR_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 6, \ + struct dfl_fpga_irq_set) + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3-59-g8ed1b From d43f20bae5173ba431526040c320c36fdd4f086d Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:46 +0800 Subject: fpga: dfl: fme: add interrupt support for global error reporting Error reporting interrupt is very useful to notify users that some errors are detected by the hardware. Once users are notified, they could query hardware logged error states, no need to continuously poll on these states. This patch adds interrupt support for fme global error reporting sub feature. It follows the common DFL interrupt notification and handling mechanism. And it implements two ioctls below for user to query number of irqs supported, and set/unset interrupt triggers. Ioctls: * DFL_FPGA_FME_ERR_GET_IRQ_NUM get the number of irqs, which is used to determine whether/how many interrupts fme error reporting feature supports. * DFL_FPGA_FME_ERR_SET_IRQ set/unset given eventfds as fme error reporting interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-fme-error.c | 18 ++++++++++++++++++ drivers/fpga/dfl-fme-main.c | 6 ++++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c index f897d414b923..51c2892ec06d 100644 --- a/drivers/fpga/dfl-fme-error.c +++ b/drivers/fpga/dfl-fme-error.c @@ -15,6 +15,7 @@ * Mitchel, Henry */ +#include #include #include "dfl.h" @@ -348,6 +349,22 @@ static void fme_global_err_uinit(struct platform_device *pdev, fme_err_mask(&pdev->dev, true); } +static long +fme_global_error_ioctl(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_FME_ERR_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_FME_ERR_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + const struct dfl_feature_id fme_global_err_id_table[] = { {.id = FME_FEATURE_ID_GLOBAL_ERR,}, {0,} @@ -356,4 +373,5 @@ const struct dfl_feature_id fme_global_err_id_table[] = { const struct dfl_feature_ops fme_global_err_ops = { .init = fme_global_err_init, .uinit = fme_global_err_uinit, + .ioctl = fme_global_error_ioctl, }; diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c index fc210d4e1863..77ea04d4edbe 100644 --- a/drivers/fpga/dfl-fme-main.c +++ b/drivers/fpga/dfl-fme-main.c @@ -620,11 +620,17 @@ static int fme_release(struct inode *inode, struct file *filp) { struct dfl_feature_platform_data *pdata = filp->private_data; struct platform_device *pdev = pdata->dev; + struct dfl_feature *feature; dev_dbg(&pdev->dev, "Device File Release\n"); mutex_lock(&pdata->lock); dfl_feature_dev_use_end(pdata); + + if (!dfl_feature_dev_use_count(pdata)) + dfl_fpga_dev_for_each_feature(pdata, feature) + dfl_fpga_set_irq_triggers(feature, 0, + feature->nr_irqs, NULL); mutex_unlock(&pdata->lock); return 0; diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index 6c71c9d7e926..b6495ea412ec 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -230,4 +230,27 @@ struct dfl_fpga_fme_port_pr { */ #define DFL_FPGA_FME_PORT_ASSIGN _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 2, int) +/** + * DFL_FPGA_FME_ERR_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_FME_BASE + 3, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga fme error reporting private + * feature. Currently hardware supports up to 1 irq. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_FME_ERR_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_FME_BASE + 3, __u32) + +/** + * DFL_FPGA_FME_ERR_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 4, + * struct dfl_fpga_irq_set) + * + * Set fpga fme error reporting interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_FME_ERR_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_FME_BASE + 4, \ + struct dfl_fpga_irq_set) + #endif /* _UAPI_LINUX_FPGA_DFL_H */ -- cgit v1.2.3-59-g8ed1b From 09d86150141955ba1b3d7cbef23785f4996e4d6f Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:47 +0800 Subject: fpga: dfl: afu: add AFU interrupt support AFU (Accelerated Function Unit) is dynamic region of the DFL based FPGA, and always defined by users. Some DFL based FPGA cards allow users to implement their own interrupts in AFU. In order to support this, hardware implements a new UINT (AFU Interrupt) private feature with related capability register which describes the number of supported AFU interrupts as well as the local index of the interrupts for software enumeration, and from software side, driver follows the common DFL interrupt notification and handling mechanism, and it implements two ioctls below for user to query number of irqs supported and set/unset interrupt triggers. Ioctls: * DFL_FPGA_PORT_UINT_GET_IRQ_NUM get the number of irqs, which is used to determine how many interrupts UINT feature supports. * DFL_FPGA_PORT_UINT_SET_IRQ set/unset eventfds as AFU interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-afu-main.c | 28 ++++++++++++++++++++++++++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c index 357cd5d9b267..7c84feea7354 100644 --- a/drivers/fpga/dfl-afu-main.c +++ b/drivers/fpga/dfl-afu-main.c @@ -529,6 +529,30 @@ static const struct dfl_feature_ops port_stp_ops = { .init = port_stp_init, }; +static long +port_uint_ioctl(struct platform_device *pdev, struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_PORT_UINT_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_PORT_UINT_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + +static const struct dfl_feature_id port_uint_id_table[] = { + {.id = PORT_FEATURE_ID_UINT,}, + {0,} +}; + +static const struct dfl_feature_ops port_uint_ops = { + .ioctl = port_uint_ioctl, +}; + static struct dfl_feature_driver port_feature_drvs[] = { { .id_table = port_hdr_id_table, @@ -546,6 +570,10 @@ static struct dfl_feature_driver port_feature_drvs[] = { .id_table = port_stp_id_table, .ops = &port_stp_ops, }, + { + .id_table = port_uint_id_table, + .ops = &port_uint_ops, + }, { .ops = NULL, } diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index b6495ea412ec..1621b077bf21 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -187,6 +187,29 @@ struct dfl_fpga_irq_set { DFL_PORT_BASE + 6, \ struct dfl_fpga_irq_set) +/** + * DFL_FPGA_PORT_UINT_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 7, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga AFU interrupt private + * feature. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_UINT_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 7, __u32) + +/** + * DFL_FPGA_PORT_UINT_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_PORT_BASE + 8, + * struct dfl_fpga_irq_set) + * + * Set fpga AFU interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_UINT_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 8, \ + struct dfl_fpga_irq_set) + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3-59-g8ed1b From 1ce50e7d408ef2bdc8ca021363fd46d1b8bfad00 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jul 2020 12:55:37 +0200 Subject: thermal: core: genetlink support for events/cmd/sampling Initially the thermal framework had a very simple notification mechanism to send generic netlink messages to the userspace. The notification function was never called from anywhere and the corresponding dead code was removed. It was probably a first attempt to introduce the netlink notification. At LPC2018, the presentation "Linux thermal: User kernel interface", proposed to create the notifications to the userspace via a kfifo. The advantage of the kfifo is the performance. It is usually used from a 1:1 communication channel where a driver captures data and sends it as fast as possible to a userspace process. The drawback is that only one process uses the notification channel exclusively, thus no other process is allowed to use the channel to get temperature or notifications. This patch defines a generic netlink API to discover the current thermal setup and adds event notifications as well as temperature sampling. As any genetlink protocol, it can evolve and the versioning allows to keep the backward compatibility. In order to prevent the user from getting flooded with data on a single channel, there are two multicast channels, one for the temperature sampling when the thermal zone is updated and another one for the events, so the user can get the events only without the thermal zone temperature sampling. Also, a list of commands to discover the thermal setup is added and can be extended when needed. Reviewed-by: Amit Kucheria Signed-off-by: Daniel Lezcano Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200706105538.2159-3-daniel.lezcano@linaro.org --- drivers/thermal/Makefile | 2 +- drivers/thermal/thermal_core.h | 18 ++ drivers/thermal/thermal_netlink.c | 648 ++++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 17 - include/uapi/linux/thermal.h | 89 +++++- 5 files changed, 739 insertions(+), 35 deletions(-) create mode 100644 drivers/thermal/thermal_netlink.c (limited to 'include/uapi') diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 0c8b84a09b9a..1bbf0805fb04 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_THERMAL) += thermal_sys.o thermal_sys-y += thermal_core.o thermal_sysfs.o \ - thermal_helpers.o + thermal_helpers.o thermal_netlink.o # interface to/from other layers providing sensors thermal_sys-$(CONFIG_THERMAL_HWMON) += thermal_hwmon.o diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 967b0ba6593e..4d9455ef0c32 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -52,6 +52,24 @@ int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), struct thermal_zone_device *thermal_zone_get_by_id(int id); +/* Netlink notification function */ +int thermal_notify_tz_create(int tz_id, const char *name); +int thermal_notify_tz_delete(int tz_id); +int thermal_notify_tz_enable(int tz_id); +int thermal_notify_tz_disable(int tz_id); +int thermal_notify_tz_trip_down(int tz_id, int id); +int thermal_notify_tz_trip_up(int tz_id, int id); +int thermal_notify_tz_trip_delete(int tz_id, int id); +int thermal_notify_tz_trip_add(int tz_id, int id, int type, + int temp, int hyst); +int thermal_notify_tz_trip_change(int tz_id, int id, int type, + int temp, int hyst); +int thermal_notify_cdev_state_update(int cdev_id, int state); +int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state); +int thermal_notify_cdev_delete(int cdev_id); +int thermal_notify_tz_gov_change(int tz_id, const char *name); +int thermal_genl_sampling_temp(int id, int temp); + struct thermal_attr { struct device_attribute attr; char name[THERMAL_NAME_LENGTH]; diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c new file mode 100644 index 000000000000..dd0a3b889674 --- /dev/null +++ b/drivers/thermal/thermal_netlink.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Linaro Limited + * + * Author: Daniel Lezcano + * + * Generic netlink for thermal management framework + */ +#include +#include +#include +#include + +#include "thermal_core.h" + +static const struct genl_multicast_group thermal_genl_mcgrps[] = { + { .name = THERMAL_GENL_SAMPLING_GROUP_NAME, }, + { .name = THERMAL_GENL_EVENT_GROUP_NAME, }, +}; + +static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = { + /* Thermal zone */ + [THERMAL_GENL_ATTR_TZ] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TEMP] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_TRIP_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_TEMP] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_TYPE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_HYST] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_MODE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_CDEV_WEIGHT] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, + /* Governor(s) */ + [THERMAL_GENL_ATTR_TZ_GOV] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_GOV_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, + /* Cooling devices */ + [THERMAL_GENL_ATTR_CDEV] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_CDEV_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_CUR_STATE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, +}; + +struct param { + struct nlattr **attrs; + struct sk_buff *msg; + const char *name; + int tz_id; + int cdev_id; + int trip_id; + int trip_temp; + int trip_type; + int trip_hyst; + int temp; + int cdev_state; + int cdev_max_state; +}; + +typedef int (*cb_t)(struct param *); + +static struct genl_family thermal_gnl_family; + +/************************** Sampling encoding *******************************/ + +int thermal_genl_sampling_temp(int id, int temp) +{ + struct sk_buff *skb; + void *hdr; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, + THERMAL_GENL_SAMPLING_TEMP); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_ID, id)) + goto out_cancel; + + if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_TEMP, temp)) + goto out_cancel; + + genlmsg_end(skb, hdr); + + genlmsg_multicast(&thermal_gnl_family, skb, 0, 0, GFP_KERNEL); + + return 0; +out_cancel: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + + return -EMSGSIZE; +} + +/**************************** Event encoding *********************************/ + +static int thermal_genl_event_tz_create(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_string(p->msg, THERMAL_GENL_ATTR_TZ_NAME, p->name)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_up(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_add(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, p->trip_type) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, p->trip_temp) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, p->trip_hyst)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_delete(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_add(struct param *p) +{ + if (nla_put_string(p->msg, THERMAL_GENL_ATTR_CDEV_NAME, + p->name) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, + p->cdev_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_MAX_STATE, + p->cdev_max_state)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_delete(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, p->cdev_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_state_update(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, + p->cdev_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_CUR_STATE, + p->cdev_state)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_gov_change(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_string(p->msg, THERMAL_GENL_ATTR_GOV_NAME, p->name)) + return -EMSGSIZE; + + return 0; +} + +int thermal_genl_event_tz_delete(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_enable(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_disable(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_trip_down(struct param *p) + __attribute__((alias("thermal_genl_event_tz_trip_up"))); + +int thermal_genl_event_tz_trip_change(struct param *p) + __attribute__((alias("thermal_genl_event_tz_trip_add"))); + +static cb_t event_cb[] = { + [THERMAL_GENL_EVENT_TZ_CREATE] = thermal_genl_event_tz_create, + [THERMAL_GENL_EVENT_TZ_DELETE] = thermal_genl_event_tz_delete, + [THERMAL_GENL_EVENT_TZ_ENABLE] = thermal_genl_event_tz_enable, + [THERMAL_GENL_EVENT_TZ_DISABLE] = thermal_genl_event_tz_disable, + [THERMAL_GENL_EVENT_TZ_TRIP_UP] = thermal_genl_event_tz_trip_up, + [THERMAL_GENL_EVENT_TZ_TRIP_DOWN] = thermal_genl_event_tz_trip_down, + [THERMAL_GENL_EVENT_TZ_TRIP_CHANGE] = thermal_genl_event_tz_trip_change, + [THERMAL_GENL_EVENT_TZ_TRIP_ADD] = thermal_genl_event_tz_trip_add, + [THERMAL_GENL_EVENT_TZ_TRIP_DELETE] = thermal_genl_event_tz_trip_delete, + [THERMAL_GENL_EVENT_CDEV_ADD] = thermal_genl_event_cdev_add, + [THERMAL_GENL_EVENT_CDEV_DELETE] = thermal_genl_event_cdev_delete, + [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update, + [THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change, +}; + +/* + * Generic netlink event encoding + */ +static int thermal_genl_send_event(enum thermal_genl_event event, + struct param *p) +{ + struct sk_buff *msg; + int ret = -EMSGSIZE; + void *hdr; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + p->msg = msg; + + hdr = genlmsg_put(msg, 0, 0, &thermal_gnl_family, 0, event); + if (!hdr) + goto out_free_msg; + + ret = event_cb[event](p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&thermal_gnl_family, msg, 0, 1, GFP_KERNEL); + + return 0; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +int thermal_notify_tz_create(int tz_id, const char *name) +{ + struct param p = { .tz_id = tz_id, .name = name }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_CREATE, &p); +} + +int thermal_notify_tz_delete(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DELETE, &p); +} + +int thermal_notify_tz_enable(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_ENABLE, &p); +} + +int thermal_notify_tz_disable(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DISABLE, &p); +} + +int thermal_notify_tz_trip_down(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_DOWN, &p); +} + +int thermal_notify_tz_trip_up(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_UP, &p); +} + +int thermal_notify_tz_trip_add(int tz_id, int trip_id, int trip_type, + int trip_temp, int trip_hyst) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id, + .trip_type = trip_type, .trip_temp = trip_temp, + .trip_hyst = trip_hyst }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_ADD, &p); +} + +int thermal_notify_tz_trip_delete(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_DELETE, &p); +} + +int thermal_notify_tz_trip_change(int tz_id, int trip_id, int trip_type, + int trip_temp, int trip_hyst) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id, + .trip_type = trip_type, .trip_temp = trip_temp, + .trip_hyst = trip_hyst }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_CHANGE, &p); +} + +int thermal_notify_cdev_state_update(int cdev_id, int cdev_state) +{ + struct param p = { .cdev_id = cdev_id, .cdev_state = cdev_state }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, &p); +} + +int thermal_notify_cdev_add(int cdev_id, const char *name, int cdev_max_state) +{ + struct param p = { .cdev_id = cdev_id, .name = name, + .cdev_max_state = cdev_max_state }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_ADD, &p); +} + +int thermal_notify_cdev_delete(int cdev_id) +{ + struct param p = { .cdev_id = cdev_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_DELETE, &p); +} + +int thermal_notify_tz_gov_change(int tz_id, const char *name) +{ + struct param p = { .tz_id = tz_id, .name = name }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p); +} + +/*************************** Command encoding ********************************/ + +static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz, + void *data) +{ + struct sk_buff *msg = data; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, tz->id) || + nla_put_string(msg, THERMAL_GENL_ATTR_TZ_NAME, tz->type)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_tz_get_id(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct nlattr *start_tz; + int ret; + + start_tz = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ); + if (!start_tz) + return -EMSGSIZE; + + ret = for_each_thermal_zone(__thermal_genl_cmd_tz_get_id, msg); + if (ret) + goto out_cancel_nest; + + nla_nest_end(msg, start_tz); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, start_tz); + + return ret; +} + +static int thermal_genl_cmd_tz_get_trip(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + struct nlattr *start_trip; + int i, id; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ_TRIP); + if (!start_trip) + return -EMSGSIZE; + + mutex_lock(&tz->lock); + + for (i = 0; i < tz->trips; i++) { + + enum thermal_trip_type type; + int temp, hyst; + + tz->ops->get_trip_type(tz, i, &type); + tz->ops->get_trip_temp(tz, i, &temp); + tz->ops->get_trip_hyst(tz, i, &hyst); + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, i) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, type) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, temp) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, hyst)) + goto out_cancel_nest; + } + + mutex_unlock(&tz->lock); + + nla_nest_end(msg, start_trip); + + return 0; + +out_cancel_nest: + mutex_unlock(&tz->lock); + + return -EMSGSIZE; +} + +static int thermal_genl_cmd_tz_get_temp(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + int temp, ret, id; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + ret = thermal_zone_get_temp(tz, &temp); + if (ret) + return ret; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TEMP, temp)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_tz_get_gov(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + int id, ret = 0; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + mutex_lock(&tz->lock); + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || + nla_put_string(msg, THERMAL_GENL_ATTR_TZ_GOV_NAME, + tz->governor->name)) + ret = -EMSGSIZE; + + mutex_unlock(&tz->lock); + + return ret; +} + +static int __thermal_genl_cmd_cdev_get(struct thermal_cooling_device *cdev, + void *data) +{ + struct sk_buff *msg = data; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CDEV_ID, cdev->id)) + return -EMSGSIZE; + + if (nla_put_string(msg, THERMAL_GENL_ATTR_CDEV_NAME, cdev->type)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_cdev_get(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct nlattr *start_cdev; + int ret; + + start_cdev = nla_nest_start(msg, THERMAL_GENL_ATTR_CDEV); + if (!start_cdev) + return -EMSGSIZE; + + ret = for_each_thermal_cooling_device(__thermal_genl_cmd_cdev_get, msg); + if (ret) + goto out_cancel_nest; + + nla_nest_end(msg, start_cdev); + + return 0; +out_cancel_nest: + nla_nest_cancel(msg, start_cdev); + + return ret; +} + +static cb_t cmd_cb[] = { + [THERMAL_GENL_CMD_TZ_GET_ID] = thermal_genl_cmd_tz_get_id, + [THERMAL_GENL_CMD_TZ_GET_TRIP] = thermal_genl_cmd_tz_get_trip, + [THERMAL_GENL_CMD_TZ_GET_TEMP] = thermal_genl_cmd_tz_get_temp, + [THERMAL_GENL_CMD_TZ_GET_GOV] = thermal_genl_cmd_tz_get_gov, + [THERMAL_GENL_CMD_CDEV_GET] = thermal_genl_cmd_cdev_get, +}; + +static int thermal_genl_cmd_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct param p = { .msg = skb }; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + int cmd = info->ops->cmd; + int ret = -EMSGSIZE; + void *hdr; + + hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + ret = cmd_cb[cmd](&p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(skb, hdr); + + return 0; + +out_cancel_msg: + genlmsg_cancel(skb, hdr); + + return ret; +} + +static int thermal_genl_cmd_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct param p = { .attrs = info->attrs }; + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + p.msg = msg; + + hdr = genlmsg_put_reply(msg, info, &thermal_gnl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = cmd_cb[cmd](&p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static const struct genl_ops thermal_genl_ops[] = { + { + .cmd = THERMAL_GENL_CMD_TZ_GET_ID, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .dumpit = thermal_genl_cmd_dumpit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_TRIP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_TEMP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_GOV, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_CDEV_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .dumpit = thermal_genl_cmd_dumpit, + }, +}; + +static struct genl_family thermal_gnl_family __ro_after_init = { + .hdrsize = 0, + .name = THERMAL_GENL_FAMILY_NAME, + .version = THERMAL_GENL_VERSION, + .maxattr = THERMAL_GENL_ATTR_MAX, + .policy = thermal_genl_policy, + .ops = thermal_genl_ops, + .n_ops = ARRAY_SIZE(thermal_genl_ops), + .mcgrps = thermal_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps), +}; + +static int __init thermal_netlink_init(void) +{ + return genl_register_family(&thermal_gnl_family); +} +core_initcall(thermal_netlink_init); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 108251f23e5c..42ef807e5d84 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -37,18 +37,6 @@ struct thermal_cooling_device; struct thermal_instance; struct thermal_attr; -enum thermal_device_mode { - THERMAL_DEVICE_DISABLED = 0, - THERMAL_DEVICE_ENABLED, -}; - -enum thermal_trip_type { - THERMAL_TRIP_ACTIVE = 0, - THERMAL_TRIP_PASSIVE, - THERMAL_TRIP_HOT, - THERMAL_TRIP_CRITICAL, -}; - enum thermal_trend { THERMAL_TREND_STABLE, /* temperature is stable */ THERMAL_TREND_RAISING, /* temperature is raising */ @@ -303,11 +291,6 @@ struct thermal_zone_params { int offset; }; -struct thermal_genl_event { - u32 orig; - enum events event; -}; - /** * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones * diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 96218378dda8..c105054cbb57 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -4,31 +4,86 @@ #define THERMAL_NAME_LENGTH 20 -/* Adding event notification support elements */ -#define THERMAL_GENL_FAMILY_NAME "thermal_event" -#define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" - -/* Events supported by Thermal Netlink */ -enum events { - THERMAL_AUX0, - THERMAL_AUX1, - THERMAL_CRITICAL, - THERMAL_DEV_FAULT, +enum thermal_device_mode { + THERMAL_DEVICE_DISABLED = 0, + THERMAL_DEVICE_ENABLED, +}; + +enum thermal_trip_type { + THERMAL_TRIP_ACTIVE = 0, + THERMAL_TRIP_PASSIVE, + THERMAL_TRIP_HOT, + THERMAL_TRIP_CRITICAL, }; -/* attributes of thermal_genl_family */ -enum { +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_SAMPLING_GROUP_NAME "sampling" +#define THERMAL_GENL_EVENT_GROUP_NAME "event" + +/* Attributes of thermal_genl_family */ +enum thermal_genl_attr { THERMAL_GENL_ATTR_UNSPEC, - THERMAL_GENL_ATTR_EVENT, + THERMAL_GENL_ATTR_TZ, + THERMAL_GENL_ATTR_TZ_ID, + THERMAL_GENL_ATTR_TZ_TEMP, + THERMAL_GENL_ATTR_TZ_TRIP, + THERMAL_GENL_ATTR_TZ_TRIP_ID, + THERMAL_GENL_ATTR_TZ_TRIP_TYPE, + THERMAL_GENL_ATTR_TZ_TRIP_TEMP, + THERMAL_GENL_ATTR_TZ_TRIP_HYST, + THERMAL_GENL_ATTR_TZ_MODE, + THERMAL_GENL_ATTR_TZ_NAME, + THERMAL_GENL_ATTR_TZ_CDEV_WEIGHT, + THERMAL_GENL_ATTR_TZ_GOV, + THERMAL_GENL_ATTR_TZ_GOV_NAME, + THERMAL_GENL_ATTR_CDEV, + THERMAL_GENL_ATTR_CDEV_ID, + THERMAL_GENL_ATTR_CDEV_CUR_STATE, + THERMAL_GENL_ATTR_CDEV_MAX_STATE, + THERMAL_GENL_ATTR_CDEV_NAME, + THERMAL_GENL_ATTR_GOV_NAME, + __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) -/* commands supported by the thermal_genl_family */ -enum { +enum thermal_genl_sampling { + THERMAL_GENL_SAMPLING_TEMP, + __THERMAL_GENL_SAMPLING_MAX, +}; +#define THERMAL_GENL_SAMPLING_MAX (__THERMAL_GENL_SAMPLING_MAX - 1) + +/* Events of thermal_genl_family */ +enum thermal_genl_event { + THERMAL_GENL_EVENT_UNSPEC, + THERMAL_GENL_EVENT_TZ_CREATE, /* Thermal zone creation */ + THERMAL_GENL_EVENT_TZ_DELETE, /* Thermal zone deletion */ + THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabed */ + THERMAL_GENL_EVENT_TZ_ENABLE, /* Thermal zone enabled */ + THERMAL_GENL_EVENT_TZ_TRIP_UP, /* Trip point crossed the way up */ + THERMAL_GENL_EVENT_TZ_TRIP_DOWN, /* Trip point crossed the way down */ + THERMAL_GENL_EVENT_TZ_TRIP_CHANGE, /* Trip point changed */ + THERMAL_GENL_EVENT_TZ_TRIP_ADD, /* Trip point added */ + THERMAL_GENL_EVENT_TZ_TRIP_DELETE, /* Trip point deleted */ + THERMAL_GENL_EVENT_CDEV_ADD, /* Cdev bound to the thermal zone */ + THERMAL_GENL_EVENT_CDEV_DELETE, /* Cdev unbound */ + THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, /* Cdev state updated */ + THERMAL_GENL_EVENT_TZ_GOV_CHANGE, /* Governor policy changed */ + __THERMAL_GENL_EVENT_MAX, +}; +#define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1) + +/* Commands supported by the thermal_genl_family */ +enum thermal_genl_cmd { THERMAL_GENL_CMD_UNSPEC, - THERMAL_GENL_CMD_EVENT, + THERMAL_GENL_CMD_TZ_GET_ID, /* List of thermal zones id */ + THERMAL_GENL_CMD_TZ_GET_TRIP, /* List of thermal trips */ + THERMAL_GENL_CMD_TZ_GET_TEMP, /* Get the thermal zone temperature */ + THERMAL_GENL_CMD_TZ_GET_GOV, /* Get the thermal zone governor */ + THERMAL_GENL_CMD_TZ_GET_MODE, /* Get the thermal zone mode */ + THERMAL_GENL_CMD_CDEV_GET, /* List of cdev id */ __THERMAL_GENL_CMD_MAX, }; #define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) -- cgit v1.2.3-59-g8ed1b From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- include/linux/bpf-cgroup.h | 4 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 3 +++ net/core/filter.c | 1 + net/ipv4/af_inet.c | 3 +++ 5 files changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c66c545e161a..2c6f26670acc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -210,6 +210,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) @@ -401,6 +404,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da9bf35a26f8..548a749aebb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -226,6 +226,7 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..156f51ffada2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: diff --git a/net/core/filter.c b/net/core/filter.c index c5e696e6c315..ddcc0d6209e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,6 +6890,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ea6ed6d487ed..ff141d630bdf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3-59-g8ed1b From 82394db7383d33641f3f565bd79792fb41b1741f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 29 Jun 2020 12:06:37 -0700 Subject: block: add capacity field to zone descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the zoned storage model, the sectors within a zone are typically all writeable. With the introduction of the Zoned Namespace (ZNS) Command Set in the NVM Express organization, the model was extended to have a specific writeable capacity. Extend the zone descriptor data structure with a zone capacity field to indicate to the user how many sectors in a zone are writeable. Introduce backward compatibility in the zone report ioctl by extending the zone report header data structure with a flags field to indicate if the capacity field is available. Reviewed-by: Jens Axboe Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Matias Bjørling Signed-off-by: Christoph Hellwig --- block/blk-zoned.c | 1 + drivers/block/null_blk_zoned.c | 2 ++ drivers/scsi/sd_zbc.c | 1 + include/uapi/linux/blkzoned.h | 15 +++++++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..81152a260354 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cc47606d8ffe..624aac09b005 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -47,6 +47,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->wp = zone->start + zone->len; zone->type = BLK_ZONE_TYPE_CONVENTIONAL; zone->cond = BLK_ZONE_COND_NOT_WP; @@ -59,6 +60,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 6f7eba66687e..183a20720da9 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, zone.non_seq = 1; zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.capacity = zone.len; zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); if (zone.type != ZBC_ZONE_TYPE_CONV && diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -73,6 +73,15 @@ enum blk_zone_cond { BLK_ZONE_COND_OFFLINE = 0xF, }; +/** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + /** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; -- cgit v1.2.3-59-g8ed1b From 1aa561b1a4c0ae2a9a9b9c21a84b5ca66b4775d8 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:21 -0700 Subject: kvm: x86: Add "last CPU" to some KVM_EXIT information More often than not, a failed VM-entry in an x86 production environment is induced by a defective CPU. To help identify the bad hardware, include the id of the last logical CPU to run a vCPU in the information provided to userspace on a KVM exit for failed VM-entry or for KVM internal errors not associated with emulation. The presence of this additional information is indicated by a new capability, KVM_CAP_LAST_CPU. Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 1 + arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 10 ++++++++-- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 2 ++ 5 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..1cfe79b932d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4794,6 +4794,7 @@ hardware_exit_reason. /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; /* if KVM_LAST_CPU */ } fail_entry; If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24b7f321874f..8ecd46f2cb1e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2947,6 +2947,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + kvm_run->fail_entry.cpu = svm->last_cpu; dump_vmcb(vcpu); return 0; } @@ -2970,8 +2971,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = svm->last_cpu; return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4d8f12c0a5c6..b52bcebfa094 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4781,10 +4781,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; + vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vmx->last_cpu; return 0; } @@ -6006,6 +6007,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6014,6 +6016,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6040,6 +6043,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + vcpu->run->internal.data[vcpu->run->internal.ndata++] = + vmx->last_cpu; return 0; } @@ -6095,8 +6100,9 @@ unexpected_vmexit: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vmx->last_cpu; return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82f457f0e7e0..1a0fad1018f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3510,6 +3510,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: r = 1; break; case KVM_CAP_SYNC_REGS: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..ff9b335620d0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -289,6 +289,7 @@ struct kvm_run { /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { @@ -1031,6 +1032,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_LAST_CPU 184 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b From 065e0d42a0a728d7f6c2aec7c9f3e5dc7b715394 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Mon, 6 Jul 2020 20:42:32 -0700 Subject: ethtool: Add support for 100Gbps per lane link modes Define 100G, 200G and 400G link modes using 100Gbps per lane LR, ER and FR are defined as a single link mode because they are using same technology and by design are fully interoperable. EEPROM content indicates if the module is LR, ER, or FR, and the user space ethtool decoder is planned to support decoding these modes in the EEPROM. Signed-off-by: Meir Lichtinger CC: Andrew Lunn Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 17 ++++++++++++++++- include/uapi/linux/ethtool.h | 15 +++++++++++++++ net/ethtool/common.c | 15 +++++++++++++++ net/ethtool/linkmodes.c | 15 +++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 46bd68e9ecfa..ff8e14b01eeb 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 75, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 90, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -78,12 +78,22 @@ static const struct phy_setting settings[] = { PHY_SETTING( 400000, FULL, 400000baseLR8_ER8_FR8_Full ), PHY_SETTING( 400000, FULL, 400000baseDR8_Full ), PHY_SETTING( 400000, FULL, 400000baseSR8_Full ), + PHY_SETTING( 400000, FULL, 400000baseCR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseKR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseLR4_ER4_FR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseSR4_Full ), /* 200G */ PHY_SETTING( 200000, FULL, 200000baseCR4_Full ), PHY_SETTING( 200000, FULL, 200000baseKR4_Full ), PHY_SETTING( 200000, FULL, 200000baseLR4_ER4_FR4_Full ), PHY_SETTING( 200000, FULL, 200000baseDR4_Full ), PHY_SETTING( 200000, FULL, 200000baseSR4_Full ), + PHY_SETTING( 200000, FULL, 200000baseCR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseKR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseLR2_ER2_FR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseSR2_Full ), /* 100G */ PHY_SETTING( 100000, FULL, 100000baseCR4_Full ), PHY_SETTING( 100000, FULL, 100000baseKR4_Full ), @@ -94,6 +104,11 @@ static const struct phy_setting settings[] = { PHY_SETTING( 100000, FULL, 100000baseLR2_ER2_FR2_Full ), PHY_SETTING( 100000, FULL, 100000baseDR2_Full ), PHY_SETTING( 100000, FULL, 100000baseSR2_Full ), + PHY_SETTING( 100000, FULL, 100000baseCR_Full ), + PHY_SETTING( 100000, FULL, 100000baseKR_Full ), + PHY_SETTING( 100000, FULL, 100000baseLR_ER_FR_Full ), + PHY_SETTING( 100000, FULL, 100000baseDR_Full ), + PHY_SETTING( 100000, FULL, 100000baseSR_Full ), /* 56G */ PHY_SETTING( 56000, FULL, 56000baseCR4_Full ), PHY_SETTING( 56000, FULL, 56000baseKR4_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1413538ef30..60856e0f9618 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1600,6 +1600,21 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ce4dbae5a943..c54166713797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -176,6 +176,21 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), + __DEFINE_LINK_MODE_NAME(100000, KR, Full), + __DEFINE_LINK_MODE_NAME(100000, SR, Full), + __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(100000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR2, Full), + __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(200000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR4, Full), + __DEFINE_LINK_MODE_NAME(400000, SR4, Full), + __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(400000, DR4, Full), + __DEFINE_LINK_MODE_NAME(400000, CR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fd4f3e58c6f6..317a93129551 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -257,6 +257,21 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), }; static const struct nla_policy -- cgit v1.2.3-59-g8ed1b From 3f935c75eb52dd968351dba824adf466fb9c9429 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:39 +0200 Subject: inet_diag: support for wider protocol numbers After commit bf9765145b85 ("sock: Make sk_protocol a 16-bit value") the current size of 'sdiag_protocol' is not sufficient to represent the possible protocol values. This change introduces a new inet diag request attribute to let user space specify the relevant protocol number using u32 values. The attribute is parsed by inet diag core on get/dump command and the extended protocol value, if available, is preferred to 'sdiag_protocol' to lookup the diag handler. The parse attributed are exposed to all the diag handlers via the cb->data. Note that inet_diag_dump_one_icsk() is left unmodified, as it will not be used by protocol using the extended attribute. Suggested-by: David S. Miller Co-developed-by: Christoph Paasch Signed-off-by: Christoph Paasch Acked-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + net/core/sock.c | 1 + net/ipv4/inet_diag.c | 65 +++++++++++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e6f183ee8417..5ba122c1949a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -65,6 +65,7 @@ enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, INET_DIAG_REQ_SK_BPF_STORAGES, + INET_DIAG_REQ_PROTOCOL, __INET_DIAG_REQ_MAX, }; diff --git a/net/core/sock.c b/net/core/sock.c index f5b5fdd61c88..de26fe4ea19f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3566,6 +3566,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; - int rem, err; + int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; - nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), rem) { - int type = nla_type(nla); - - if (type < __INET_DIAG_REQ_MAX) - cb_data->req_nlas[type] = nla; - } + inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); nla = cb_data->inet_diag_nla_bc; if (nla) { @@ -1237,7 +1266,8 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, + sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) return netlink_dump_start(net->diag_nlsk, skb, h, &c); } - return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, + nlmsg_data(h)); } static -- cgit v1.2.3-59-g8ed1b From ac3b45f6095452a9731f8825be1513d326dbfa15 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:41 +0200 Subject: mptcp: add MPTCP socket diag interface exposes basic inet socket attribute, plus some MPTCP socket fields comprising PM status and MPTCP-level sequence numbers. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 17 +++++ net/mptcp/Kconfig | 4 ++ net/mptcp/Makefile | 2 + net/mptcp/mptcp_diag.c | 169 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 net/mptcp/mptcp_diag.c (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5f2c77082d9e..9762660df741 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -86,4 +86,21 @@ enum { __MPTCP_PM_CMD_AFTER_LAST }; +#define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) + +struct mptcp_info { + __u8 mptcpi_subflows; + __u8 mptcpi_add_addr_signal; + __u8 mptcpi_add_addr_accepted; + __u8 mptcpi_subflows_max; + __u8 mptcpi_add_addr_signal_max; + __u8 mptcpi_add_addr_accepted_max; + __u32 mptcpi_flags; + __u32 mptcpi_token; + __u64 mptcpi_write_seq; + __u64 mptcpi_snd_una; + __u64 mptcpi_rcv_nxt; +}; + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index af84fce70bb0..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,6 +13,10 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index c53f9b845523..2360cbd27d59 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); -- cgit v1.2.3-59-g8ed1b From a21cf0a8330bba60e44ca6c99e1591042f336ff5 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:18 +0300 Subject: devlink: Add a new devlink port lanes attribute and pass to netlink Add a new devlink port attribute that indicates the port's number of lanes. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. The attribute is not passed to user space in case the number of lanes is invalid (0). Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 4 ++++ 4 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f44cb1a537f3..6cde196f6b70 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2134,6 +2134,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, int err; attrs.split = split; + attrs.lanes = lanes; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/include/net/devlink.h b/include/net/devlink.h index 8f9db991192d..91a9f8770d08 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,10 +68,12 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { u8 split:1; + u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; union { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 87c83a82991b..f741ab8d9cf0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -455,6 +455,8 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + DEVLINK_ATTR_PORT_LANES, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 266936c38357..7f26d1054974 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -530,6 +530,10 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (!devlink_port->attrs_set) return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { -- cgit v1.2.3-59-g8ed1b From a0f49b54865273c895be3826d6d59cbc5ad725c2 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:20 +0300 Subject: devlink: Add a new devlink port split ability attribute and pass to netlink Add a new attribute that indicates the split ability of devlink port. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 1 + include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 3 +++ 5 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f85f5d88d331..8b3791d73c99 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2135,6 +2135,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, attrs.split = split; attrs.lanes = lanes; + attrs.splittable = splittable; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 71f4e624b3db..b6a10565309a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -367,6 +367,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) return ret; attrs.split = eth_port.is_split; + attrs.splittable = !attrs.split; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; attrs.phys.split_subport_number = eth_port.label_subport; diff --git a/include/net/devlink.h b/include/net/devlink.h index 91a9f8770d08..746bed538664 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,11 +68,13 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @splittable: indicates if the port can be split. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { - u8 split:1; + u8 split:1, + splittable:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f741ab8d9cf0..cfef4245ea5a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -456,6 +456,7 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ DEVLINK_ATTR_PORT_LANES, /* u32 */ + DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f26d1054974..94c797b74378 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -534,6 +534,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { @@ -7547,6 +7549,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; + WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); -- cgit v1.2.3-59-g8ed1b From 631beddc5466731b048263a4a9d3d67150e72f8d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 9 Jul 2020 14:08:55 +0200 Subject: virt: vbox: Add support for the new VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES ioctl Add support for the new VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES ioctl, this is necessary for automatic resizing of the guest resolution to match the VM-window size to work with the new VMSVGA virtual GPU which is now the new default in VirtualBox. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1789545 Acked-by: Arnd Bergmann Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200709120858.63928-6-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_core.c | 163 +++++++++++++++++++++++++++++++- drivers/virt/vboxguest/vboxguest_core.h | 14 +++ include/uapi/linux/vboxguest.h | 24 +++++ 3 files changed, 200 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c index 15b3cb618c6e..4f1addaa3f6f 100644 --- a/drivers/virt/vboxguest/vboxguest_core.c +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -679,7 +679,7 @@ static int vbg_set_host_capabilities(struct vbg_dev *gdev, WARN_ON(!mutex_is_locked(&gdev->session_mutex)); - caps = gdev->set_guest_caps_tracker.mask; + caps = gdev->acquired_guest_caps | gdev->set_guest_caps_tracker.mask; if (gdev->guest_caps_host == caps) return 0; @@ -703,6 +703,113 @@ static int vbg_set_host_capabilities(struct vbg_dev *gdev, return vbg_status_code_to_errno(rc); } +/** + * Acquire (get exclusive access) guest capabilities for a session. + * Takes the session mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @flags: Flags (VBGL_IOC_AGC_FLAGS_XXX). + * @or_mask: The capabilities to add. + * @not_mask: The capabilities to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_acquire_session_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + u32 flags, bool session_termination) +{ + unsigned long irqflags; + bool wakeup = false; + int ret = 0; + + mutex_lock(&gdev->session_mutex); + + if (gdev->set_guest_caps_tracker.mask & or_mask) { + vbg_err("%s error: cannot acquire caps which are currently set\n", + __func__); + ret = -EINVAL; + goto out; + } + + /* + * Mark any caps in the or_mask as now being in acquire-mode. Note + * once caps are in acquire_mode they always stay in this mode. + * This impacts event handling, so we take the event-lock. + */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + gdev->acquire_mode_guest_caps |= or_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + /* If we only have to switch the caps to acquire mode, we're done. */ + if (flags & VBGL_IOC_AGC_FLAGS_CONFIG_ACQUIRE_MODE) + goto out; + + not_mask &= ~or_mask; /* or_mask takes priority over not_mask */ + not_mask &= session->acquired_guest_caps; + or_mask &= ~session->acquired_guest_caps; + + if (or_mask == 0 && not_mask == 0) + goto out; + + if (gdev->acquired_guest_caps & or_mask) { + ret = -EBUSY; + goto out; + } + + gdev->acquired_guest_caps |= or_mask; + gdev->acquired_guest_caps &= ~not_mask; + /* session->acquired_guest_caps impacts event handling, take the lock */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps |= or_mask; + session->acquired_guest_caps &= ~not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + ret = vbg_set_host_capabilities(gdev, session, session_termination); + /* Roll back on failure, unless it's session termination time. */ + if (ret < 0 && !session_termination) { + gdev->acquired_guest_caps &= ~or_mask; + gdev->acquired_guest_caps |= not_mask; + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps &= ~or_mask; + session->acquired_guest_caps |= not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + } + + /* + * If we added a capability, check if that means some other thread in + * our session should be unblocked because there are events pending + * (the result of vbg_get_allowed_event_mask_for_session() may change). + * + * HACK ALERT! When the seamless support capability is added we generate + * a seamless change event so that the ring-3 client can sync with + * the seamless state. + */ + if (ret == 0 && or_mask != 0) { + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + + if (or_mask & VMMDEV_GUEST_SUPPORTS_SEAMLESS) + gdev->pending_events |= + VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + if (gdev->pending_events) + wakeup = true; + + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + if (wakeup) + wake_up(&gdev->event_wq); + } + +out: + mutex_unlock(&gdev->session_mutex); + + return ret; +} + /** * Sets the guest capabilities for a session. Takes the session spinlock. * Return: 0 or negative errno value. @@ -725,6 +832,13 @@ static int vbg_set_session_capabilities(struct vbg_dev *gdev, mutex_lock(&gdev->session_mutex); + if (gdev->acquire_mode_guest_caps & or_mask) { + vbg_err("%s error: cannot set caps which are in acquire_mode\n", + __func__); + ret = -EBUSY; + goto out; + } + /* Apply the changes to the session mask. */ previous = session->set_guest_caps; session->set_guest_caps |= or_mask; @@ -962,6 +1076,7 @@ void vbg_core_close_session(struct vbg_session *session) struct vbg_dev *gdev = session->gdev; int i, rc; + vbg_acquire_session_capabilities(gdev, session, 0, U32_MAX, 0, true); vbg_set_session_capabilities(gdev, session, 0, U32_MAX, true); vbg_set_session_event_filter(gdev, session, 0, U32_MAX, true); @@ -1019,6 +1134,25 @@ static int vbg_ioctl_driver_version_info( return 0; } +/* Must be called with the event_lock held */ +static u32 vbg_get_allowed_event_mask_for_session(struct vbg_dev *gdev, + struct vbg_session *session) +{ + u32 acquire_mode_caps = gdev->acquire_mode_guest_caps; + u32 session_acquired_caps = session->acquired_guest_caps; + u32 allowed_events = VMMDEV_EVENT_VALID_EVENT_MASK; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS)) + allowed_events &= ~VMMDEV_EVENT_DISPLAY_CHANGE_REQUEST; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS)) + allowed_events &= ~VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + return allowed_events; +} + static bool vbg_wait_event_cond(struct vbg_dev *gdev, struct vbg_session *session, u32 event_mask) @@ -1030,6 +1164,7 @@ static bool vbg_wait_event_cond(struct vbg_dev *gdev, spin_lock_irqsave(&gdev->event_spinlock, flags); events = gdev->pending_events & event_mask; + events &= vbg_get_allowed_event_mask_for_session(gdev, session); wakeup = events || session->cancel_waiters; spin_unlock_irqrestore(&gdev->event_spinlock, flags); @@ -1044,6 +1179,7 @@ static u32 vbg_consume_events_locked(struct vbg_dev *gdev, { u32 events = gdev->pending_events & event_mask; + events &= vbg_get_allowed_event_mask_for_session(gdev, session); gdev->pending_events &= ~events; return events; } @@ -1445,6 +1581,29 @@ static int vbg_ioctl_change_filter_mask(struct vbg_dev *gdev, false); } +static int vbg_ioctl_acquire_guest_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_acquire_guest_caps *caps) +{ + u32 flags, or_mask, not_mask; + + if (vbg_ioctl_chk(&caps->hdr, sizeof(caps->u.in), 0)) + return -EINVAL; + + flags = caps->u.in.flags; + or_mask = caps->u.in.or_mask; + not_mask = caps->u.in.not_mask; + + if (flags & ~VBGL_IOC_AGC_FLAGS_VALID_MASK) + return -EINVAL; + + if ((or_mask | not_mask) & ~VMMDEV_GUEST_CAPABILITIES_MASK) + return -EINVAL; + + return vbg_acquire_session_capabilities(gdev, session, or_mask, + not_mask, flags, false); +} + static int vbg_ioctl_change_guest_capabilities(struct vbg_dev *gdev, struct vbg_session *session, struct vbg_ioctl_set_guest_caps *caps) { @@ -1554,6 +1713,8 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) return vbg_ioctl_interrupt_all_wait_events(gdev, session, data); case VBG_IOCTL_CHANGE_FILTER_MASK: return vbg_ioctl_change_filter_mask(gdev, session, data); + case VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES: + return vbg_ioctl_acquire_guest_capabilities(gdev, session, data); case VBG_IOCTL_CHANGE_GUEST_CAPABILITIES: return vbg_ioctl_change_guest_capabilities(gdev, session, data); case VBG_IOCTL_CHECK_BALLOON: diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h index dc745a033164..ab4bf64e2cec 100644 --- a/drivers/virt/vboxguest/vboxguest_core.h +++ b/drivers/virt/vboxguest/vboxguest_core.h @@ -117,6 +117,15 @@ struct vbg_dev { */ u32 event_filter_host; + /** + * Guest capabilities which have been switched to acquire_mode. + */ + u32 acquire_mode_guest_caps; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; /** * Usage counters for guest capabilities requested through * vbg_set_session_capabilities(). Indexed by capability bit @@ -164,6 +173,11 @@ struct vbg_session { * host filter. Protected by vbg_gdev.session_mutex. */ u32 event_filter; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; /** * Guest capabilities set through vbg_set_session_capabilities(). * A capability claimed by any guest session will be reported to the diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h index f79d7abe27db..15125f6ec60d 100644 --- a/include/uapi/linux/vboxguest.h +++ b/include/uapi/linux/vboxguest.h @@ -257,6 +257,30 @@ VMMDEV_ASSERT_SIZE(vbg_ioctl_change_filter, 24 + 8); _IOWR('V', 12, struct vbg_ioctl_change_filter) +/** VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES data structure. */ +struct vbg_ioctl_acquire_guest_caps { + /** The header. */ + struct vbg_ioctl_hdr hdr; + union { + struct { + /** Flags (VBGL_IOC_AGC_FLAGS_XXX). */ + __u32 flags; + /** Capabilities to set (VMMDEV_GUEST_SUPPORTS_XXX). */ + __u32 or_mask; + /** Capabilities to drop (VMMDEV_GUEST_SUPPORTS_XXX). */ + __u32 not_mask; + } in; + } u; +}; +VMMDEV_ASSERT_SIZE(vbg_ioctl_acquire_guest_caps, 24 + 12); + +#define VBGL_IOC_AGC_FLAGS_CONFIG_ACQUIRE_MODE 0x00000001 +#define VBGL_IOC_AGC_FLAGS_VALID_MASK 0x00000001 + +#define VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES \ + _IOWR('V', 13, struct vbg_ioctl_acquire_guest_caps) + + /** VBG_IOCTL_CHANGE_GUEST_CAPABILITIES data structure. */ struct vbg_ioctl_set_guest_caps { /** The header. */ -- cgit v1.2.3-59-g8ed1b From 316b0035402f05fe9e9e5334d1ff65dae285cb7c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 9 Jul 2020 14:08:56 +0200 Subject: virt: vbox: Add a few new vmmdev request types to the userspace whitelist Upstream VirtualBox has defined and is using a few new request types for vmmdev requests passed through /dev/vboxguest to the hypervisor. Add the defines for these to vbox_vmmdev_types.h and add add them to the whitelists of vmmdev requests which userspace is allowed to make. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1789545 Acked-by: Arnd Bergmann Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200709120858.63928-7-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_core.c | 2 ++ include/uapi/linux/vbox_vmmdev_types.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c index 4f1addaa3f6f..ffd76b949276 100644 --- a/drivers/virt/vboxguest/vboxguest_core.c +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -1299,7 +1299,9 @@ static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session, case VMMDEVREQ_VIDEO_ACCEL_ENABLE: case VMMDEVREQ_VIDEO_ACCEL_FLUSH: case VMMDEVREQ_VIDEO_SET_VISIBLE_REGION: + case VMMDEVREQ_VIDEO_UPDATE_MONITOR_POSITIONS: case VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ_MULTI: case VMMDEVREQ_GET_SEAMLESS_CHANGE_REQ: case VMMDEVREQ_GET_VRDPCHANGE_REQ: case VMMDEVREQ_LOG_STRING: diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index c27289fd619a..f8a8d6b3c521 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -63,6 +63,7 @@ enum vmmdev_request_type { VMMDEVREQ_SET_GUEST_CAPABILITIES = 56, VMMDEVREQ_VIDEMODE_SUPPORTED2 = 57, /* since version 3.2.0 */ VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX = 80, /* since version 4.2.4 */ + VMMDEVREQ_GET_DISPLAY_CHANGE_REQ_MULTI = 81, VMMDEVREQ_HGCM_CONNECT = 60, VMMDEVREQ_HGCM_DISCONNECT = 61, VMMDEVREQ_HGCM_CALL32 = 62, @@ -92,6 +93,8 @@ enum vmmdev_request_type { VMMDEVREQ_WRITE_COREDUMP = 218, VMMDEVREQ_GUEST_HEARTBEAT = 219, VMMDEVREQ_HEARTBEAT_CONFIGURE = 220, + VMMDEVREQ_NT_BUG_CHECK = 221, + VMMDEVREQ_VIDEO_UPDATE_MONITOR_POSITIONS = 222, /* Ensure the enum is a 32 bit data-type */ VMMDEVREQ_SIZEHACK = 0x7fffffff }; -- cgit v1.2.3-59-g8ed1b From 04aaca197f16aff608e19ee98b1e55f535d746be Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 17 Jun 2020 17:33:13 +0900 Subject: char: raw: do not leak CONFIG_MAX_RAW_DEVS to userspace include/uapi/linux/raw.h leaks CONFIG_MAX_RAW_DEVS to userspace. Userspace programs cannot use MAX_RAW_MINORS since CONFIG_MAX_RAW_DEVS is not available anyway. Remove the MAX_RAW_MINORS definition from the exported header, and use CONFIG_MAX_RAW_DEVS in drivers/char/raw.c While I was here, I converted printk(KERN_WARNING ...) to pr_warn(...) and stretched the warning message. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20200617083313.183184-1-masahiroy@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/char/raw.c | 8 ++++---- include/uapi/linux/raw.h | 2 -- scripts/headers_install.sh | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 3484e9145aea..380bf518338e 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -37,7 +37,7 @@ static struct raw_device_data *raw_devices; static DEFINE_MUTEX(raw_mutex); static const struct file_operations raw_ctl_fops; /* forward declaration */ -static int max_raw_minors = MAX_RAW_MINORS; +static int max_raw_minors = CONFIG_MAX_RAW_DEVS; module_param(max_raw_minors, int, 0); MODULE_PARM_DESC(max_raw_minors, "Maximum number of raw devices (1-65536)"); @@ -317,9 +317,9 @@ static int __init raw_init(void) int ret; if (max_raw_minors < 1 || max_raw_minors > 65536) { - printk(KERN_WARNING "raw: invalid max_raw_minors (must be" - " between 1 and 65536), using %d\n", MAX_RAW_MINORS); - max_raw_minors = MAX_RAW_MINORS; + pr_warn("raw: invalid max_raw_minors (must be between 1 and 65536), using %d\n", + CONFIG_MAX_RAW_DEVS); + max_raw_minors = CONFIG_MAX_RAW_DEVS; } raw_devices = vzalloc(array_size(max_raw_minors, diff --git a/include/uapi/linux/raw.h b/include/uapi/linux/raw.h index dc96dda479d6..47874919d0b9 100644 --- a/include/uapi/linux/raw.h +++ b/include/uapi/linux/raw.h @@ -14,6 +14,4 @@ struct raw_config_request __u64 block_minor; }; -#define MAX_RAW_MINORS CONFIG_MAX_RAW_DEVS - #endif /* __LINUX_RAW_H */ diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 224f51012b6e..cdd66038818c 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -90,7 +90,6 @@ include/uapi/linux/elfcore.h:CONFIG_BINFMT_ELF_FDPIC include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE -include/uapi/linux/raw.h:CONFIG_MAX_RAW_DEVS " for c in $configs -- cgit v1.2.3-59-g8ed1b From 7c97f3aded10aa86fc1944341288434117e9c926 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 2 Jul 2020 11:29:31 +0300 Subject: RDMA/counter: Add PID category support in auto mode With the "PID" category QPs have same PID will be bound to same counter; If this category is not set then QPs have different PIDs will be bound to same counter. This is implemented for 2 reasons: 1. The counter is a limited resource, while there may be dozens of applications, each of which creates several types of QPs, which means it may doesn't have enough counter. 2. The system administrator needs all QPs created by all applications with same type bound to one counter. The counter name and PID is only make sense when "PID" category are configured. This category can also be used in combine with others, e.g. QP type. Link: https://lore.kernel.org/r/20200702082933.424537-2-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 20 +++++--------------- drivers/infiniband/core/nldev.c | 8 ++++++-- include/uapi/rdma/rdma_netlink.h | 1 + 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 2257d7f7810f..40204c6caa5b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -8,7 +8,7 @@ #include "core_priv.h" #include "restrack.h" -#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_counter_mode *curr, enum rdma_nl_counter_mode new_mode, @@ -149,23 +149,13 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, struct auto_mode_param *param = &counter->mode.param; bool match = true; - /* - * Ensure that counter belongs to the right PID. This operation can - * race with user space which kills the process and leaves QP and - * counters orphans. - * - * It is not a big deal because exitted task will leave both QP and - * counter in the same bucket of zombie process. Just ensure that - * process is still alive before procedding. - * - */ - if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task) || - !task_pid_nr(qp->res.task)) - return false; - if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); + if (auto_mask & RDMA_COUNTER_MASK_PID) + match &= (task_pid_nr(counter->res.task) == + task_pid_nr(qp->res.task)); + return match; } diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 1051b5622b62..76af7ea2875d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -711,11 +711,16 @@ static int fill_stat_counter_mode(struct sk_buff *msg, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; - if (m->mode == RDMA_COUNTER_MODE_AUTO) + if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; + if ((m->mask & RDMA_COUNTER_MASK_PID) && + fill_res_name_pid(msg, &counter->res)) + return -EMSGSIZE; + } + return 0; } @@ -855,7 +860,6 @@ static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || - fill_res_name_pid(msg, &counter->res) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3826143d420d..d2f5b8396243 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -569,5 +569,6 @@ enum rdma_nl_counter_mode { */ enum rdma_nl_counter_mask { RDMA_COUNTER_MASK_QP_TYPE = 1, + RDMA_COUNTER_MASK_PID = 1 << 1, }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3-59-g8ed1b From c7d759eb7b12f91a25f4d3cd03ff5209046ddfc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jul 2020 17:42:47 -0700 Subject: ethtool: add tunnel info interface Add an interface to report offloaded UDP ports via ethtool netlink. Now that core takes care of tracking which UDP tunnel ports the NICs are aware of we can quite easily export this information out to user space. The responsibility of writing the netlink dumps is split between ethtool code and udp_tunnel_nic.c - since udp_tunnel module may not always be loaded, yet we should always report the capabilities of the NIC. $ ethtool --show-tunnels eth0 Tunnel information for eth0: UDP port table 0: Size: 4 Types: vxlan No entries UDP port table 1: Size: 4 Types: geneve, vxlan-gpe Entries (1): port 1230, vxlan-gpe v4: - back to v2, build fix is now directly in udp_tunnel.h v3: - don't compile ETHTOOL_MSG_TUNNEL_INFO_GET in if CONFIG_INET not set. v2: - fix string set count, - reorder enums in the uAPI, - fix type of ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES to bitset in docs and comments. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 33 ++++ include/net/udp_tunnel.h | 21 +++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 55 ++++++ net/ethtool/Makefile | 3 +- net/ethtool/common.c | 9 + net/ethtool/common.h | 1 + net/ethtool/netlink.c | 12 ++ net/ethtool/netlink.h | 4 + net/ethtool/strset.c | 5 + net/ethtool/tunnels.c | 259 +++++++++++++++++++++++++++ net/ipv4/udp_tunnel_nic.c | 69 +++++++ 12 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/tunnels.c (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 459a0d11cfde..7d75f1e32152 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1230,6 +1230,39 @@ used to report the amplitude of the reflection for a given pair. | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | +-+-+-----------------------------------------+--------+----------------------+ +TUNNEL_INFO +=========== + +Gets information about the tunnel state NIC is aware of. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_TUNNEL_INFO_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_UDP_PORTS`` | nested | all UDP port tables | + +-+-------------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_TUNNEL_UDP_TABLE`` | nested | one UDP port table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE`` | u32 | max size of the | + | | | | | table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` | bitset | tunnel types which | + | | | | | table can hold | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY`` | nested | offloaded UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT`` | be16 | UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE`` | u32 | tunnel type | + +-+-+-+---------------------------------------+--------+---------------------+ + Request translation =================== diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index ee34619e4cfa..dd20ce99740c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -255,6 +255,10 @@ struct udp_tunnel_nic_ops { void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*reset_ntf)(struct net_device *dev); + + size_t (*dump_size)(struct net_device *dev, unsigned int table); + int (*dump_write)(struct net_device *dev, unsigned int table, + struct sk_buff *skb); }; #ifdef CONFIG_INET @@ -318,4 +322,21 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->reset_ntf(dev); } + +static inline size_t +udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_size(dev, table); +} + +static inline int +udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_write(dev, table, skb); +} #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 60856e0f9618..b4f2d134e713 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -669,6 +669,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags * @ETH_SS_TS_TX_TYPES: timestamping Tx types * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -686,6 +687,7 @@ enum ethtool_stringset { ETH_SS_SOF_TIMESTAMPING, ETH_SS_TS_TX_TYPES, ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c12ce4df4b6b..5dcd24cb33ea 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -41,6 +41,7 @@ enum { ETHTOOL_MSG_TSINFO_GET, ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -556,6 +557,60 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 }; +/* TUNNEL INFO */ + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + __ETHTOOL_UDP_TUNNEL_TYPE_CNT +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0c2b94f20499..7a849ff22dad 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ + tunnels.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index c54166713797..ed19573fccd7 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -272,6 +273,14 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", + [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe", +}; +static_assert(ARRAY_SIZE(udp_tunnel_type_names) == + __ETHTOOL_UDP_TUNNEL_TYPE_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b83bef38368c..3d9251c95a8b 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -28,6 +28,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88fd07f47040..fb9d096faaa4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,6 +181,12 @@ err: return NULL; } +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) +{ + return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, cmd); +} + void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, @@ -849,6 +855,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, }, + { + .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, + .doit = ethnl_tunnel_info_doit, + .start = ethnl_tunnel_info_start, + .dumpit = ethnl_tunnel_info_dumpit, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a96b6e90dc2..e2085005caac 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,7 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); @@ -361,5 +362,8 @@ int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_start(struct netlink_callback *cb); +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 773634b6b048..82707b662fe4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_UDP_TUNNEL_TYPES] = { + .per_dev = false, + .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + .strings = udp_tunnel_type_names, + }, }; struct strset_req_info { diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c new file mode 100644 index 000000000000..6b89255f1231 --- /dev/null +++ b/net/ethtool/tunnels.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include "bitset.h" +#include "common.h" +#include "netlink.h" + +static const struct nla_policy +ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = { + [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED }, +}; + +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == + ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); + +static ssize_t +ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + unsigned int i; + size_t size; + int ret; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) { + NL_SET_ERR_MSG(extack, + "device does not report tunnel offload info"); + return -EOPNOTSUPP; + } + + size = nla_total_size(0); /* _INFO_UDP_PORTS */ + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + return size; + + size += nla_total_size(0); /* _UDP_TABLE */ + size += nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ + ret = ethnl_bitset32_size(&info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact); + if (ret < 0) + return ret; + size += ret; + + size += udp_tunnel_nic_dump_size(req_base->dev, i); + } + + return size; +} + +static int +ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, + struct sk_buff *skb) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + struct nlattr *ports, *table; + unsigned int i; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) + return -EOPNOTSUPP; + + ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); + if (!ports) + return -EMSGSIZE; + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + info->tables[i].n_entries)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) + goto err_cancel_table; + + nla_nest_end(skb, table); + } + + nla_nest_end(skb, ports); + + return 0; + +err_cancel_table: + nla_nest_cancel(skb, table); +err_cancel_ports: + nla_nest_cancel(skb, ports); + return -EMSGSIZE; +} + +static int +ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1]; + int ret; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX, + ethtool_tunnel_info_policy, extack); + if (ret < 0) + return ret; + + return ethnl_parse_header_dev_get(req_info, + tb[ETHTOOL_A_TUNNEL_INFO_HEADER], + net, extack, require_dev); +} + +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr, + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.dev, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_A_TUNNEL_INFO_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); + if (ret) + goto err_free_msg; + rtnl_unlock(); + dev_put(req_info.dev); + genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + dev_put(req_info.dev); + return ret; +} + +struct ethnl_tunnel_info_dump_ctx { + struct ethnl_req_info req_info; + int pos_hash; + int pos_idx; +}; + +int ethnl_tunnel_info_start(struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + memset(ctx, 0, sizeof(*ctx)); + + ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh, + sock_net(cb->skb->sk), cb->extack, + false); + if (ctx->req_info.dev) { + dev_put(ctx->req_info.dev); + ctx->req_info.dev = NULL; + } + + return ret; +} + +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + cb->seq = net->dev_base_seq; + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + + head = &net->dev_index_head[h]; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET); + if (!ehdr) { + ret = -EMSGSIZE; + goto out; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + goto out; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto cont; + goto out; + } + genlmsg_end(skb, ehdr); +cont: + idx++; + } + } +out: + rtnl_unlock(); + + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + if (ret == -EMSGSIZE && skb->len) + return skb->len; + return ret; +} diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 056cfe0b770e..f0dbd9905a53 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. +#include #include #include #include @@ -72,6 +73,12 @@ udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) return entry->use_cnt == 0 && !entry->flags; } +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { @@ -564,12 +571,74 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) __udp_tunnel_nic_device_sync(dev, utn); } +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, }; static void -- cgit v1.2.3-59-g8ed1b From 3edd68399dc155b80335244c8c2673eaa652931a Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:11 +0200 Subject: KVM: x86: Add a capability for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support This patch adds a new capability KVM_CAP_SMALLER_MAXPHYADDR which allows userspace to query if the underlying architecture would support GUEST_MAXPHYADDR < HOST_MAXPHYADDR and hence act accordingly (e.g. qemu can decide if it should warn for -cpu ..,phys-bits=X) The complications in this patch are due to unexpected (but documented) behaviour we see with NPF vmexit handling in AMD processor. If SVM is modified to add guest physical address checks in the NPF and guest #PF paths, we see the followning error multiple times in the 'access' test in kvm-unit-tests: test pte.p pte.36 pde.p: FAIL: pte 2000021 expected 2000001 Dump mapping: address: 0x123400000000 ------L4: 24c3027 ------L3: 24c4027 ------L2: 24c5021 ------L1: 1002000021 This is because the PTE's accessed bit is set by the CPU hardware before the NPF vmexit. This is handled completely by hardware and cannot be fixed in software. Therefore, availability of the new capability depends on a boolean variable allow_smaller_maxphyaddr which is set individually by VMX and SVM init routines. On VMX it's always set to true, on SVM it's only set to true when NPT is not enabled. CC: Tom Lendacky CC: Babu Moger Signed-off-by: Mohammed Gamal Message-Id: <20200710154811.418214-10-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 15 +++++++++++++++ arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/x86.c | 6 ++++++ include/uapi/linux/kvm.h | 2 ++ 5 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1df95f10c903..1bab87a444d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1263,7 +1263,7 @@ struct kvm_arch_async_pf { }; extern u64 __read_mostly host_efer; - +extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; #define __KVM_HAVE_ARCH_VM_ALLOC diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2371b1e40f39..783330d0e7b8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + return 0; err: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 962a78c7dde5..1bb59ae5016d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8309,6 +8309,13 @@ static int __init vmx_init(void) #endif vmx_check_vmcs12_offsets(); + /* + * Intel processors don't have problems with + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable + * it for VMX by default + */ + allow_smaller_maxphyaddr = true; + return 0; } module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35abe69aad28..95ef62922869 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); +bool __read_mostly allow_smaller_maxphyaddr; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + static u64 __read_mostly host_xss; u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -3574,6 +3577,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ff9b335620d0..2c73dcfb3dbb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1033,6 +1033,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 #define KVM_CAP_LAST_CPU 184 +#define KVM_CAP_SMALLER_MAXPHYADDR 185 + #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 15:42:46 -0700 Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong direction flag set. While this isn't a big deal as nothing currently enforces these bits in the kernel, it should be defined correctly. Fix the define and provide support for the old command until it is no longer needed for backward compatibility. Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 3 ++- kernel/seccomp.c | 9 +++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index c1735455bc53..965290f7dcc2 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -123,5 +123,6 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0e3f3a7a5d..0ed57e8c49d0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,14 @@ #include #include +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 43884b6625fc..61f9ac200001 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -186,7 +186,7 @@ struct seccomp_metadata { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) struct seccomp_notif { __u64 id; -- cgit v1.2.3-59-g8ed1b From 21249616f02dc3f4c5efc3b3e9ba48e428b0131c Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Wed, 8 Jul 2020 12:15:57 +0800 Subject: gpio: uapi: fix misplaced comment line The second line of the description for event_type is before the first. Move it to after the first line. Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 0206383c0383..9c27cecf406f 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -71,8 +71,8 @@ enum { * of a GPIO line * @info: updated line information * @timestamp: estimate of time of status change occurrence, in nanoseconds - * and GPIOLINE_CHANGED_CONFIG * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED + * and GPIOLINE_CHANGED_CONFIG * * Note: struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can -- cgit v1.2.3-59-g8ed1b From c1326210477ecc06c53221f0005c64419aba30d6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:20 +0000 Subject: nfs,nfsd: NFSv4.2 extended attribute protocol definitions Add definitions for the new operations, errors and flags as defined in RFC 8276 (File System Extended Attributes in NFSv4). Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 20 ++++++++++++++++++++ include/uapi/linux/nfs4.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4dba3c948932..e6ca9d1d2e76 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -150,6 +150,12 @@ enum nfs_opnum4 { OP_WRITE_SAME = 70, OP_CLONE = 71, + /* xattr support (RFC8726) */ + OP_GETXATTR = 72, + OP_SETXATTR = 73, + OP_LISTXATTRS = 74, + OP_REMOVEXATTR = 75, + OP_ILLEGAL = 10044, }; @@ -280,6 +286,10 @@ enum nfsstat4 { NFS4ERR_WRONG_LFS = 10092, NFS4ERR_BADLABEL = 10093, NFS4ERR_OFFLOAD_NO_REQS = 10094, + + /* xattr (RFC8276) */ + NFS4ERR_NOXATTR = 10095, + NFS4ERR_XATTR2BIG = 10096, }; static inline bool seqid_mutating_err(u32 err) @@ -452,6 +462,7 @@ enum change_attr_type4 { #define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) +#define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) @@ -700,4 +711,13 @@ struct nl4_server { struct nfs42_netaddr nl4_addr; /* NL4_NETADDR */ } u; }; + +/* + * Options for setxattr. These match the flags for setxattr(2). + */ +enum nfs4_setxattr_options { + SETXATTR4_EITHER = 0, + SETXATTR4_CREATE = 1, + SETXATTR4_REPLACE = 2, +}; #endif diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 8572930cf5b0..bf197e99b98f 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -33,6 +33,9 @@ #define NFS4_ACCESS_EXTEND 0x0008 #define NFS4_ACCESS_DELETE 0x0010 #define NFS4_ACCESS_EXECUTE 0x0020 +#define NFS4_ACCESS_XAREAD 0x0040 +#define NFS4_ACCESS_XAWRITE 0x0080 +#define NFS4_ACCESS_XALIST 0x0100 #define NFS4_FH_PERSISTENT 0x0000 #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001 -- cgit v1.2.3-59-g8ed1b From 95ad37f90c338e3fd4abf61cecfe02b6f3e080f0 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:04 +0000 Subject: NFSv4.2: add client side xattr caching. Implement client side caching for NFSv4.2 extended attributes. The cache is a per-inode hashtable, with name/value entries. There is one special entry for the listxattr cache. NFS inodes have a pointer to a cache structure. The cache structure is allocated on demand, freed when the cache is invalidated. Memory shrinkers keep the size in check. Large entries (> PAGE_SIZE) are collected by a separate shrinker, and freed more aggressively than others. Signed-off-by: Frank van der Linden Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/inode.c | 9 +- fs/nfs/nfs42proc.c | 12 + fs/nfs/nfs42xattr.c | 1083 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4_fs.h | 22 + fs/nfs/nfs4proc.c | 42 +- fs/nfs/nfs4super.c | 10 + include/linux/nfs_fs.h | 6 + include/uapi/linux/nfs_fs.h | 1 + 9 files changed, 1179 insertions(+), 8 deletions(-) create mode 100644 fs/nfs/nfs42xattr.c (limited to 'include/uapi') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 2433c3e03cfa..22d11fdc6deb 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -30,7 +30,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o -nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 629af798dfc9..10048eed485d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -193,6 +193,7 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) return nfs_check_cache_invalid_not_delegated(inode, flags); } +EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { @@ -234,11 +235,13 @@ static void nfs_zap_caches_locked(struct inode *inode) | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); } else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -1897,7 +1900,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME @@ -2100,6 +2104,9 @@ struct inode *nfs_alloc_inode(struct super_block *sb) #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ +#ifdef CONFIG_NFS_V4_2 + nfsi->xattr_cache = NULL; +#endif return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8c2e52bc986a..e200522469af 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1182,6 +1182,18 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, if (ret < 0) return ret; + /* + * Normally, the caching is done one layer up, but for successful + * RPCS, always cache the result here, even if the caller was + * just querying the length, or if the reply was too big for + * the caller. This avoids a second RPC in the case of the + * common query-alloc-retrieve cycle for xattrs. + * + * Note that xattr_len is always capped to XATTR_SIZE_MAX. + */ + + nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); + if (buflen) { if (res.xattr_len > buflen) return -ERANGE; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c new file mode 100644 index 000000000000..23fdab977a2a --- /dev/null +++ b/fs/nfs/nfs42xattr.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * User extended attribute client side cache functions. + * + * Author: Frank van der Linden + */ +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" + +/* + * User extended attributes client side caching is implemented by having + * a cache structure attached to NFS inodes. This structure is allocated + * when needed, and freed when the cache is zapped. + * + * The cache structure contains as hash table of entries, and a pointer + * to a special-cased entry for the listxattr cache. + * + * Accessing and allocating / freeing the caches is done via reference + * counting. The cache entries use a similar refcounting scheme. + * + * This makes freeing a cache, both from the shrinker and from the + * zap cache path, easy. It also means that, in current use cases, + * the large majority of inodes will not waste any memory, as they + * will never have any user extended attributes assigned to them. + * + * Attribute entries are hashed in to a simple hash table. They are + * also part of an LRU. + * + * There are three shrinkers. + * + * Two shrinkers deal with the cache entries themselves: one for + * large entries (> PAGE_SIZE), and one for smaller entries. The + * shrinker for the larger entries works more aggressively than + * those for the smaller entries. + * + * The other shrinker frees the cache structures themselves. + */ + +/* + * 64 buckets is a good default. There is likely no reasonable + * workload that uses more than even 64 user extended attributes. + * You can certainly add a lot more - but you get what you ask for + * in those circumstances. + */ +#define NFS4_XATTR_HASH_SIZE 64 + +#define NFSDBG_FACILITY NFSDBG_XATTRCACHE + +struct nfs4_xattr_cache; +struct nfs4_xattr_entry; + +struct nfs4_xattr_bucket { + spinlock_t lock; + struct hlist_head hlist; + struct nfs4_xattr_cache *cache; + bool draining; +}; + +struct nfs4_xattr_cache { + struct kref ref; + spinlock_t hash_lock; /* protects hashtable and lru */ + struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; + struct list_head lru; + struct list_head dispose; + atomic_long_t nent; + spinlock_t listxattr_lock; + struct inode *inode; + struct nfs4_xattr_entry *listxattr; + struct work_struct work; +}; + +struct nfs4_xattr_entry { + struct kref ref; + struct hlist_node hnode; + struct list_head lru; + struct list_head dispose; + char *xattr_name; + void *xattr_value; + size_t xattr_size; + struct nfs4_xattr_bucket *bucket; + uint32_t flags; +}; + +#define NFS4_XATTR_ENTRY_EXTVAL 0x0001 + +/* + * LRU list of NFS inodes that have xattr caches. + */ +static struct list_lru nfs4_xattr_cache_lru; +static struct list_lru nfs4_xattr_entry_lru; +static struct list_lru nfs4_xattr_large_entry_lru; + +static struct kmem_cache *nfs4_xattr_cache_cachep; + +static struct workqueue_struct *nfs4_xattr_cache_wq; + +/* + * Hashing helper functions. + */ +static void +nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&cache->buckets[i].hlist); + spin_lock_init(&cache->buckets[i].lock); + cache->buckets[i].cache = cache; + cache->buckets[i].draining = false; + } +} + +/* + * Locking order: + * 1. inode i_lock or bucket lock + * 2. list_lru lock (taken by list_lru_* functions) + */ + +/* + * Wrapper functions to add a cache entry to the right LRU. + */ +static bool +nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_add(lru, &entry->lru); +} + +static bool +nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_del(lru, &entry->lru); +} + +/* + * This function allocates cache entries. They are the normal + * extended attribute name/value pairs, but may also be a listxattr + * cache. Those allocations use the same entry so that they can be + * treated as one by the memory shrinker. + * + * xattr cache entries are allocated together with names. If the + * value fits in to one page with the entry structure and the name, + * it will also be part of the same allocation (kmalloc). This is + * expected to be the vast majority of cases. Larger allocations + * have a value pointer that is allocated separately by kvmalloc. + * + * Parameters: + * + * @name: Name of the extended attribute. NULL for listxattr cache + * entry. + * @value: Value of attribute, or listxattr cache. NULL if the + * value is to be copied from pages instead. + * @pages: Pages to copy the value from, if not NULL. Passed in to + * make it easier to copy the value after an RPC, even if + * the value will not be passed up to application (e.g. + * for a 'query' getxattr with NULL buffer). + * @len: Length of the value. Can be 0 for zero-length attribues. + * @value and @pages will be NULL if @len is 0. + */ +static struct nfs4_xattr_entry * +nfs4_xattr_alloc_entry(const char *name, const void *value, + struct page **pages, size_t len) +{ + struct nfs4_xattr_entry *entry; + void *valp; + char *namep; + size_t alloclen, slen; + char *buf; + uint32_t flags; + + BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) + + XATTR_NAME_MAX + 1 > PAGE_SIZE); + + alloclen = sizeof(struct nfs4_xattr_entry); + if (name != NULL) { + slen = strlen(name) + 1; + alloclen += slen; + } else + slen = 0; + + if (alloclen + len <= PAGE_SIZE) { + alloclen += len; + flags = 0; + } else { + flags = NFS4_XATTR_ENTRY_EXTVAL; + } + + buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (buf == NULL) + return NULL; + entry = (struct nfs4_xattr_entry *)buf; + + if (name != NULL) { + namep = buf + sizeof(struct nfs4_xattr_entry); + memcpy(namep, name, slen); + } else { + namep = NULL; + } + + + if (flags & NFS4_XATTR_ENTRY_EXTVAL) { + valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (valp == NULL) { + kfree(buf); + return NULL; + } + } else if (len != 0) { + valp = buf + sizeof(struct nfs4_xattr_entry) + slen; + } else + valp = NULL; + + if (valp != NULL) { + if (value != NULL) + memcpy(valp, value, len); + else + _copy_from_pages(valp, pages, 0, len); + } + + entry->flags = flags; + entry->xattr_value = valp; + kref_init(&entry->ref); + entry->xattr_name = namep; + entry->xattr_size = len; + entry->bucket = NULL; + INIT_LIST_HEAD(&entry->lru); + INIT_LIST_HEAD(&entry->dispose); + INIT_HLIST_NODE(&entry->hnode); + + return entry; +} + +static void +nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry) +{ + if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) + kvfree(entry->xattr_value); + kfree(entry); +} + +static void +nfs4_xattr_free_entry_cb(struct kref *kref) +{ + struct nfs4_xattr_entry *entry; + + entry = container_of(kref, struct nfs4_xattr_entry, ref); + + if (WARN_ON(!list_empty(&entry->lru))) + return; + + nfs4_xattr_free_entry(entry); +} + +static void +nfs4_xattr_free_cache_cb(struct kref *kref) +{ + struct nfs4_xattr_cache *cache; + int i; + + cache = container_of(kref, struct nfs4_xattr_cache, ref); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist))) + return; + cache->buckets[i].draining = false; + } + + cache->listxattr = NULL; + + kmem_cache_free(nfs4_xattr_cache_cachep, cache); + +} + +static struct nfs4_xattr_cache * +nfs4_xattr_alloc_cache(void) +{ + struct nfs4_xattr_cache *cache; + + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, + GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (cache == NULL) + return NULL; + + kref_init(&cache->ref); + atomic_long_set(&cache->nent, 0); + + return cache; +} + +/* + * Set the listxattr cache, which is a special-cased cache entry. + * The special value ERR_PTR(-ESTALE) is used to indicate that + * the cache is being drained - this prevents a new listxattr + * cache from being added to what is now a stale cache. + */ +static int +nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *new) +{ + struct nfs4_xattr_entry *old; + int ret = 1; + + spin_lock(&cache->listxattr_lock); + + old = cache->listxattr; + + if (old == ERR_PTR(-ESTALE)) { + ret = 0; + goto out; + } + + cache->listxattr = new; + if (new != NULL && new != ERR_PTR(-ESTALE)) + nfs4_xattr_entry_lru_add(new); + + if (old != NULL) { + nfs4_xattr_entry_lru_del(old); + kref_put(&old->ref, nfs4_xattr_free_entry_cb); + } +out: + spin_unlock(&cache->listxattr_lock); + + return ret; +} + +/* + * Unlink a cache from its parent inode, clearing out an invalid + * cache. Must be called with i_lock held. + */ +static struct nfs4_xattr_cache * +nfs4_xattr_cache_unlink(struct inode *inode) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *oldcache; + + nfsi = NFS_I(inode); + + oldcache = nfsi->xattr_cache; + if (oldcache != NULL) { + list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + oldcache->inode = NULL; + } + nfsi->xattr_cache = NULL; + nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR; + + return oldcache; + +} + +/* + * Discard a cache. Usually called by a worker, since walking all + * the entries can take up some cycles that we don't want to waste + * in the I/O path. Can also be called from the shrinker callback. + * + * The cache is dead, it has already been unlinked from its inode, + * and no longer appears on the cache LRU list. + * + * Mark all buckets as draining, so that no new entries are added. This + * could still happen in the unlikely, but possible case that another + * thread had grabbed a reference before it was unlinked from the inode, + * and is still holding it for an add operation. + * + * Remove all entries from the LRU lists, so that there is no longer + * any way to 'find' this cache. Then, remove the entries from the hash + * table. + * + * At that point, the cache will remain empty and can be freed when the final + * reference drops, which is very likely the kref_put at the end of + * this function, or the one called immediately afterwards in the + * shrinker callback. + */ +static void +nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + struct nfs4_xattr_entry *entry; + struct nfs4_xattr_bucket *bucket; + struct hlist_node *n; + + nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE)); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + bucket = &cache->buckets[i]; + + spin_lock(&bucket->lock); + bucket->draining = true; + hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) { + nfs4_xattr_entry_lru_del(entry); + hlist_del_init(&entry->hnode); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + spin_unlock(&bucket->lock); + } + + atomic_long_set(&cache->nent, 0); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +static void +nfs4_xattr_discard_cache_worker(struct work_struct *work) +{ + struct nfs4_xattr_cache *cache = container_of(work, + struct nfs4_xattr_cache, work); + + nfs4_xattr_discard_cache(cache); +} + +static void +nfs4_xattr_reap_cache(struct nfs4_xattr_cache *cache) +{ + queue_work(nfs4_xattr_cache_wq, &cache->work); +} + +/* + * Get a referenced copy of the cache structure. Avoid doing allocs + * while holding i_lock. Which means that we do some optimistic allocation, + * and might have to free the result in rare cases. + * + * This function only checks the NFS_INO_INVALID_XATTR cache validity bit + * and acts accordingly, replacing the cache when needed. For the read case + * (!add), this means that the caller must make sure that the cache + * is valid before caling this function. getxattr and listxattr call + * revalidate_inode to do this. The attribute cache timeout (for the + * non-delegated case) is expected to be dealt with in the revalidate + * call. + */ + +static struct nfs4_xattr_cache * +nfs4_xattr_get_cache(struct inode *inode, int add) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *cache, *oldcache, *newcache; + + nfsi = NFS_I(inode); + + cache = oldcache = NULL; + + spin_lock(&inode->i_lock); + + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) + oldcache = nfs4_xattr_cache_unlink(inode); + else + cache = nfsi->xattr_cache; + + if (cache != NULL) + kref_get(&cache->ref); + + spin_unlock(&inode->i_lock); + + if (add && cache == NULL) { + newcache = NULL; + + cache = nfs4_xattr_alloc_cache(); + if (cache == NULL) + goto out; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) { + /* + * The cache was invalidated again. Give up, + * since what we want to enter is now likely + * outdated anyway. + */ + spin_unlock(&inode->i_lock); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = NULL; + goto out; + } + + /* + * Check if someone beat us to it. + */ + if (nfsi->xattr_cache != NULL) { + newcache = nfsi->xattr_cache; + kref_get(&newcache->ref); + } else { + kref_get(&cache->ref); + nfsi->xattr_cache = cache; + cache->inode = inode; + list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + } + + spin_unlock(&inode->i_lock); + + /* + * If there was a race, throw away the cache we just + * allocated, and use the new one allocated by someone + * else. + */ + if (newcache != NULL) { + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = newcache; + } + } + +out: + /* + * Discarding an old cache is done via a workqueue. + */ + if (oldcache != NULL) + nfs4_xattr_reap_cache(oldcache); + + return cache; +} + +static inline struct nfs4_xattr_bucket * +nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name) +{ + return &cache->buckets[jhash(name, strlen(name), 0) & + (ARRAY_SIZE(cache->buckets) - 1)]; +} + +static struct nfs4_xattr_entry * +nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name) +{ + struct nfs4_xattr_entry *entry; + + entry = NULL; + + hlist_for_each_entry(entry, &bucket->hlist, hnode) { + if (!strcmp(entry->xattr_name, name)) + break; + } + + return entry; +} + +static int +nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *entry) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *oldentry = NULL; + int ret = 1; + + bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name); + entry->bucket = bucket; + + spin_lock(&bucket->lock); + + if (bucket->draining) { + ret = 0; + goto out; + } + + oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name); + if (oldentry != NULL) { + hlist_del_init(&oldentry->hnode); + nfs4_xattr_entry_lru_del(oldentry); + } else { + atomic_long_inc(&cache->nent); + } + + hlist_add_head(&entry->hnode, &bucket->hlist); + nfs4_xattr_entry_lru_add(entry); + +out: + spin_unlock(&bucket->lock); + + if (oldentry != NULL) + kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb); + + return ret; +} + +static void +nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) { + hlist_del_init(&entry->hnode); + nfs4_xattr_entry_lru_del(entry); + atomic_long_dec(&cache->nent); + } + + spin_unlock(&bucket->lock); + + if (entry != NULL) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); +} + +static struct nfs4_xattr_entry * +nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) + kref_get(&entry->ref); + + spin_unlock(&bucket->lock); + + return entry; +} + +/* + * Entry point to retrieve an entry from the cache. + */ +ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + ret = 0; + entry = nfs4_xattr_hash_find(cache, name); + + if (entry != NULL) { + dprintk("%s: cache hit '%s', len %lu\n", __func__, + entry->xattr_name, (unsigned long)entry->xattr_size); + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (buflen < entry->xattr_size) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } else { + dprintk("%s: cache miss '%s'\n", __func__, name); + ret = -ENOENT; + } + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Retrieve a cached list of xattrs from the cache. + */ +ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + spin_lock(&cache->listxattr_lock); + + entry = cache->listxattr; + + if (entry != NULL && entry != ERR_PTR(-ESTALE)) { + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (entry->xattr_size > buflen) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + } else { + ret = -ENOENT; + } + + spin_unlock(&cache->listxattr_lock); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Add an xattr to the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + dprintk("%s: add '%s' len %lu\n", __func__, + name, (unsigned long)buflen); + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); + if (entry == NULL) + goto out; + + (void)nfs4_xattr_set_listcache(cache, NULL); + + if (!nfs4_xattr_hash_add(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + + +/* + * Remove an xattr from the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_remove(struct inode *inode, const char *name) +{ + struct nfs4_xattr_cache *cache; + + dprintk("%s: remove '%s'\n", __func__, name); + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return; + + (void)nfs4_xattr_set_listcache(cache, NULL); + nfs4_xattr_hash_remove(cache, name); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Cache listxattr output, replacing any possible old one. + */ +void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen); + if (entry == NULL) + goto out; + + /* + * This is just there to be able to get to bucket->cache, + * which is obviously the same for all buckets, so just + * use bucket 0. + */ + entry->bucket = &cache->buckets[0]; + + if (!nfs4_xattr_set_listcache(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Zap the entire cache. Called when an inode is evicted. + */ +void nfs4_xattr_cache_zap(struct inode *inode) +{ + struct nfs4_xattr_cache *oldcache; + + spin_lock(&inode->i_lock); + oldcache = nfs4_xattr_cache_unlink(inode); + spin_unlock(&inode->i_lock); + + if (oldcache) + nfs4_xattr_discard_cache(oldcache); +} + +/* + * The entry LRU is shrunk more aggressively than the cache LRU, + * by settings @seeks to 1. + * + * Cache structures are freed only when they've become empty, after + * pruning all but one entry. + */ + +static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker nfs4_xattr_cache_shrinker = { + .count_objects = nfs4_xattr_cache_count, + .scan_objects = nfs4_xattr_cache_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = DEFAULT_SEEKS, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_large_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = 1, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static enum lru_status +cache_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct inode *inode; + struct nfs4_xattr_cache *cache = container_of(item, + struct nfs4_xattr_cache, lru); + + if (atomic_long_read(&cache->nent) > 1) + return LRU_SKIP; + + /* + * If a cache structure is on the LRU list, we know that + * its inode is valid. Try to lock it to break the link. + * Since we're inverting the lock order here, only try. + */ + inode = cache->inode; + + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + kref_get(&cache->ref); + + cache->inode = NULL; + NFS_I(inode)->xattr_cache = NULL; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR; + list_lru_isolate(lru, &cache->lru); + + spin_unlock(&inode->i_lock); + + list_add_tail(&cache->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_cache *cache; + + freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc, + cache_lru_isolate, &dispose); + while (!list_empty(&dispose)) { + cache = list_first_entry(&dispose, struct nfs4_xattr_cache, + dispose); + list_del_init(&cache->dispose); + nfs4_xattr_discard_cache(cache); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + } + + return freed; +} + + +static unsigned long +nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + + count = list_lru_count(&nfs4_xattr_cache_lru); + return vfs_pressure_ratio(count); +} + +static enum lru_status +entry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry = container_of(item, + struct nfs4_xattr_entry, lru); + + bucket = entry->bucket; + cache = bucket->cache; + + /* + * Unhook the entry from its parent (either a cache bucket + * or a cache structure if it's a listxattr buf), so that + * it's no longer found. Then add it to the isolate list, + * to be freed later. + * + * In both cases, we're reverting lock order, so use + * trylock and skip the entry if we can't get the lock. + */ + if (entry->xattr_name != NULL) { + /* Regular cache entry */ + if (!spin_trylock(&bucket->lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + hlist_del_init(&entry->hnode); + atomic_long_dec(&cache->nent); + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&bucket->lock); + } else { + /* Listxattr cache entry */ + if (!spin_trylock(&cache->listxattr_lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + cache->listxattr = NULL; + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&cache->listxattr_lock); + } + + list_add_tail(&entry->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_entry *entry; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); + + while (!list_empty(&dispose)) { + entry = list_first_entry(&dispose, struct nfs4_xattr_entry, + dispose); + list_del_init(&entry->dispose); + + /* + * Drop two references: the one that we just grabbed + * in entry_lru_isolate, and the one that was set + * when the entry was first allocated. + */ + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + + return freed; +} + +static unsigned long +nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + count = list_lru_count(lru); + return vfs_pressure_ratio(count); +} + + +static void nfs4_xattr_cache_init_once(void *p) +{ + struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + + spin_lock_init(&cache->listxattr_lock); + atomic_long_set(&cache->nent, 0); + nfs4_xattr_hash_init(cache); + cache->listxattr = NULL; + INIT_WORK(&cache->work, nfs4_xattr_discard_cache_worker); + INIT_LIST_HEAD(&cache->lru); + INIT_LIST_HEAD(&cache->dispose); +} + +int __init nfs4_xattr_cache_init(void) +{ + int ret = 0; + + nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", + sizeof(struct nfs4_xattr_cache), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT), + nfs4_xattr_cache_init_once); + if (nfs4_xattr_cache_cachep == NULL) + return -ENOMEM; + + ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru, + &nfs4_xattr_large_entry_shrinker); + if (ret) + goto out4; + + ret = list_lru_init_memcg(&nfs4_xattr_entry_lru, + &nfs4_xattr_entry_shrinker); + if (ret) + goto out3; + + ret = list_lru_init_memcg(&nfs4_xattr_cache_lru, + &nfs4_xattr_cache_shrinker); + if (ret) + goto out2; + + nfs4_xattr_cache_wq = alloc_workqueue("nfs4_xattr", WQ_MEM_RECLAIM, 0); + if (nfs4_xattr_cache_wq == NULL) + goto out1; + + ret = register_shrinker(&nfs4_xattr_cache_shrinker); + if (ret) + goto out0; + + ret = register_shrinker(&nfs4_xattr_entry_shrinker); + if (ret) + goto out; + + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + if (!ret) + return 0; + + unregister_shrinker(&nfs4_xattr_entry_shrinker); +out: + unregister_shrinker(&nfs4_xattr_cache_shrinker); +out0: + destroy_workqueue(nfs4_xattr_cache_wq); +out1: + list_lru_destroy(&nfs4_xattr_cache_lru); +out2: + list_lru_destroy(&nfs4_xattr_entry_lru); +out3: + list_lru_destroy(&nfs4_xattr_large_entry_lru); +out4: + kmem_cache_destroy(nfs4_xattr_cache_cachep); + + return ret; +} + +void nfs4_xattr_cache_exit(void) +{ + unregister_shrinker(&nfs4_xattr_entry_shrinker); + unregister_shrinker(&nfs4_xattr_cache_shrinker); + list_lru_destroy(&nfs4_xattr_entry_lru); + list_lru_destroy(&nfs4_xattr_cache_lru); + kmem_cache_destroy(nfs4_xattr_cache_cachep); + destroy_workqueue(nfs4_xattr_cache_wq); +} diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2fa9e4ea98d2..7e16a586f3fc 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -626,12 +626,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state * nfs4_stateid_match_other(&state->open_stateid, stateid); } +/* nfs42xattr.c */ +#ifdef CONFIG_NFS_V4_2 +extern int __init nfs4_xattr_cache_init(void); +extern void nfs4_xattr_cache_exit(void); +extern void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, + ssize_t buflen); +extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name); +extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, + char *buf, ssize_t buflen); +extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen); +extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, + ssize_t buflen); +extern void nfs4_xattr_cache_zap(struct inode *inode); #else +static inline void nfs4_xattr_cache_zap(struct inode *inode) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +#else /* CONFIG_NFS_V4 */ #define nfs4_close_state(a, b) do { } while (0) #define nfs4_close_sync(a, b) do { } while (0) #define nfs4_state_protect(a, b, c, d) do { } while (0) #define nfs4_state_protect_write(a, b, c, d) do { } while (0) + #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92a07956f07b..f670ff64b31e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7448,6 +7448,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, size_t buflen, int flags) { struct nfs_access_entry cache; + int ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; @@ -7466,10 +7467,17 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, return -EACCES; } - if (buf == NULL) - return nfs42_proc_removexattr(inode, key); - else - return nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (buf == NULL) { + ret = nfs42_proc_removexattr(inode, key); + if (!ret) + nfs4_xattr_cache_remove(inode, key); + } else { + ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (!ret) + nfs4_xattr_cache_add(inode, key, buf, NULL, buflen); + } + + return ret; } static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, @@ -7477,6 +7485,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, const char *key, void *buf, size_t buflen) { struct nfs_access_entry cache; + ssize_t ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; @@ -7486,7 +7495,17 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, return -EACCES; } - return nfs42_proc_getxattr(inode, key, buf, buflen); + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_get(inode, key, buf, buflen); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + ret = nfs42_proc_getxattr(inode, key, buf, buflen); + + return ret; } static ssize_t @@ -7494,7 +7513,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) { u64 cookie; bool eof; - int ret, size; + ssize_t ret, size; char *buf; size_t buflen; struct nfs_access_entry cache; @@ -7507,6 +7526,14 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) return 0; } + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_list(inode, list, list_len); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + cookie = 0; eof = false; buflen = list_len ? list_len : XATTR_LIST_MAX; @@ -7526,6 +7553,9 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) size += ret; } + if (list_len) + nfs4_xattr_cache_set_list(inode, list, size); + return size; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 1475f932d7da..0c1ab846b83d 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -69,6 +69,7 @@ static void nfs4_evict_inode(struct inode *inode) pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); + nfs4_xattr_cache_zap(inode); } struct nfs_referral_count { @@ -268,6 +269,12 @@ static int __init init_nfs_v4(void) if (err) goto out1; +#ifdef CONFIG_NFS_V4_2 + err = nfs4_xattr_cache_init(); + if (err) + goto out2; +#endif + err = nfs4_register_sysctl(); if (err) goto out2; @@ -288,6 +295,9 @@ static void __exit exit_nfs_v4(void) nfs4_pnfs_v3_ds_connect_unload(); unregister_nfs_version(&nfs_v4); +#ifdef CONFIG_NFS_V4_2 + nfs4_xattr_cache_exit(); +#endif nfs4_unregister_sysctl(); nfs_idmap_quit(); nfs_dns_resolver_destroy(); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 943ee750d68c..a2c6455ea3fa 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -102,6 +102,8 @@ struct nfs_delegation; struct posix_acl; +struct nfs4_xattr_cache; + /* * nfs fs inode data in memory */ @@ -188,6 +190,10 @@ struct nfs_inode { struct fscache_cookie *fscache; #endif struct inode vfs_inode; + +#ifdef CONFIG_NFS_V4_2 + struct nfs4_xattr_cache *xattr_cache; +#endif }; struct nfs4_copy_state { diff --git a/include/uapi/linux/nfs_fs.h b/include/uapi/linux/nfs_fs.h index 7bcc8cd6831d..3afe3767c55d 100644 --- a/include/uapi/linux/nfs_fs.h +++ b/include/uapi/linux/nfs_fs.h @@ -56,6 +56,7 @@ #define NFSDBG_PNFS 0x1000 #define NFSDBG_PNFS_LD 0x2000 #define NFSDBG_STATE 0x4000 +#define NFSDBG_XATTRCACHE 0x8000 #define NFSDBG_ALL 0xFFFF -- cgit v1.2.3-59-g8ed1b From 8aa5a33578e9685d06020bd10d1637557423e945 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:33 +0000 Subject: xsk: Add new statistics It can be useful for the user to know the reason behind a dropped packet. Introduce new counters which track drops on the receive path caused by: 1. rx ring being full 2. fill ring being empty Also, on the tx path introduce a counter which tracks the number of times we attempt pull from the tx ring when it is empty. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-2-ciara.loftus@intel.com --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 5 ++++- net/xdp/xsk.c | 36 +++++++++++++++++++++++++++++++----- net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 6 ++++++ tools/include/uapi/linux/if_xdp.h | 5 ++++- 6 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 96bfc5f5f24e..c9d87cc40c11 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -69,7 +69,11 @@ struct xdp_sock { spinlock_t tx_completion_lock; /* Protects generic receive. */ spinlock_t rx_lock; + + /* Statistics */ u64 rx_dropped; + u64 rx_queue_full; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3700266229f6..26e3bba8c204 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -123,7 +123,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xp_get_handle(xskb); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { - xs->rx_dropped++; + xs->rx_queue_full++; return err; } @@ -274,8 +274,10 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + xs->tx->queue_empty_descs++; continue; + } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -387,6 +389,8 @@ static int xsk_generic_xmit(struct sock *sk) sent_frame = true; } + xs->tx->queue_empty_descs++; + out: if (sent_frame) sk->sk_write_space(sk); @@ -812,6 +816,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) ring->desc = offsetof(struct xdp_umem_ring, desc); } +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -831,19 +841,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, case XDP_STATISTICS: { struct xdp_statistics stats; + bool extra_stats = true; + size_t stats_size; - if (len < sizeof(stats)) + if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); - if (copy_to_user(optval, &stats, sizeof(stats))) + if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; - if (put_user(sizeof(stats), optlen)) + if (put_user(stats_size, optlen)) return -EFAULT; return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..89cf3551d3e9 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -235,6 +235,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; xp_release(xskb); return NULL; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5b5d24d2dd37..bf42cfd74b89 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -38,6 +38,7 @@ struct xsk_queue { u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; + u64 queue_empty_descs; }; /* The structure of the shared state of the rings are the same as the @@ -354,6 +355,11 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? q->queue_empty_descs : 0; +} + struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { -- cgit v1.2.3-59-g8ed1b From 0d80cb4612aa32dc0faa17fa3ab6f96f33e2b4a7 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:35 +0000 Subject: xsk: Add xdp statistics to xsk_diag Add xdp statistics to the information dumped through the xsk_diag interface Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-4-ciara.loftus@intel.com --- include/uapi/linux/xdp_diag.h | 11 +++++++++++ net/xdp/xsk_diag.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h index 78b2591a7782..66b9973b4f4c 100644 --- a/include/uapi/linux/xdp_diag.h +++ b/include/uapi/linux/xdp_diag.h @@ -30,6 +30,7 @@ struct xdp_diag_msg { #define XDP_SHOW_RING_CFG (1 << 1) #define XDP_SHOW_UMEM (1 << 2) #define XDP_SHOW_MEMINFO (1 << 3) +#define XDP_SHOW_STATS (1 << 4) enum { XDP_DIAG_NONE, @@ -41,6 +42,7 @@ enum { XDP_DIAG_UMEM_FILL_RING, XDP_DIAG_UMEM_COMPLETION_RING, XDP_DIAG_MEMINFO, + XDP_DIAG_STATS, __XDP_DIAG_MAX, }; @@ -69,4 +71,13 @@ struct xdp_diag_umem { __u32 refs; }; +struct xdp_diag_stats { + __u64 n_rx_dropped; + __u64 n_rx_invalid; + __u64 n_rx_full; + __u64 n_fill_ring_empty; + __u64 n_tx_invalid; + __u64 n_tx_ring_empty; +}; + #endif /* _LINUX_XDP_DIAG_H */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0163b26aaf63..21e9c2d123ee 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -76,6 +76,19 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) return err; } +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, @@ -118,6 +131,10 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; -- cgit v1.2.3-59-g8ed1b From ed757328c34015be4ec51861a90bb3bcc807ad58 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 13 Jul 2020 12:24:18 +0200 Subject: atm: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: David S. Miller --- drivers/atm/solos-pci.c | 2 +- include/uapi/linux/atmioc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index c32f7dd9879a..b7646ae55942 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the Solos PCI ADSL2+ card, designed to support Linux by - * Traverse Technologies -- http://www.traverse.com.au/ + * Traverse Technologies -- https://www.traverse.com.au/ * Xrio Limited -- http://www.xrio.com/ * * Copyright © 2008 Traverse Technologies diff --git a/include/uapi/linux/atmioc.h b/include/uapi/linux/atmioc.h index cd7655e40c77..a9030bcc8d56 100644 --- a/include/uapi/linux/atmioc.h +++ b/include/uapi/linux/atmioc.h @@ -5,7 +5,7 @@ /* - * See http://icawww1.epfl.ch/linux-atm/magic.html for the complete list of + * See https://icawww1.epfl.ch/linux-atm/magic.html for the complete list of * "magic" ioctl numbers. */ -- cgit v1.2.3-59-g8ed1b From 2801758391ba6b0c20e253b956355e1b15ad85a2 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:48 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes for MRP interconnect Extend the existing MRP netlink attributes to allow to configure MRP Interconnect: IFLA_BRIDGE_MRP_IN_ROLE - the parameter type is br_mrp_in_role which contains the interconnect id, the ring id, the interconnect role(MIM or MIC) and the port ifindex that represents the interconnect port. IFLA_BRIDGE_MRP_IN_STATE - the parameter type is br_mrp_in_state which contains the interconnect id and the interconnect state. IFLA_BRIDGE_MRP_IN_TEST - the parameter type is br_mrp_start_in_test which contains the interconnect id, the interval at which to send MRP_InTest frames, how many test frames can be missed before declaring the interconnect ring open and the period which represents for how long to send MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 53 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/mrp_bridge.h | 38 +++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c114c1c2bd53..d840a3e37a37 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -167,6 +167,9 @@ enum { IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, IFLA_BRIDGE_MRP_INFO, + IFLA_BRIDGE_MRP_IN_ROLE, + IFLA_BRIDGE_MRP_IN_STATE, + IFLA_BRIDGE_MRP_START_IN_TEST, __IFLA_BRIDGE_MRP_MAX, }; @@ -245,6 +248,37 @@ enum { #define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) +enum { + IFLA_BRIDGE_MRP_IN_STATE_UNSPEC, + IFLA_BRIDGE_MRP_IN_STATE_IN_ID, + IFLA_BRIDGE_MRP_IN_STATE_STATE, + __IFLA_BRIDGE_MRP_IN_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_STATE_MAX (__IFLA_BRIDGE_MRP_IN_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_IN_ROLE_RING_ID, + IFLA_BRIDGE_MRP_IN_ROLE_IN_ID, + IFLA_BRIDGE_MRP_IN_ROLE_ROLE, + IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX, + __IFLA_BRIDGE_MRP_IN_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_ROLE_MAX (__IFLA_BRIDGE_MRP_IN_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID, + IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_IN_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_IN_TEST_MAX (__IFLA_BRIDGE_MRP_START_IN_TEST_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; @@ -270,6 +304,25 @@ struct br_mrp_start_test { __u32 monitor; }; +struct br_mrp_in_state { + __u32 in_state; + __u16 in_id; +}; + +struct br_mrp_in_role { + __u32 ring_id; + __u32 in_role; + __u32 i_ifindex; + __u16 in_id; +}; + +struct br_mrp_start_in_test { + __u32 interval; + __u32 max_miss; + __u32 period; + __u16 in_id; +}; + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bee366540212..6aeb13ef0b1e 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -21,11 +21,22 @@ enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_MRA, }; +enum br_mrp_in_role_type { + BR_MRP_IN_ROLE_DISABLED, + BR_MRP_IN_ROLE_MIC, + BR_MRP_IN_ROLE_MIM, +}; + enum br_mrp_ring_state_type { BR_MRP_RING_STATE_OPEN, BR_MRP_RING_STATE_CLOSED, }; +enum br_mrp_in_state_type { + BR_MRP_IN_STATE_OPEN, + BR_MRP_IN_STATE_CLOSED, +}; + enum br_mrp_port_state_type { BR_MRP_PORT_STATE_DISABLED, BR_MRP_PORT_STATE_BLOCKED, @@ -36,6 +47,7 @@ enum br_mrp_port_state_type { enum br_mrp_port_role_type { BR_MRP_PORT_ROLE_PRIMARY, BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_INTER, }; enum br_mrp_tlv_header_type { @@ -45,6 +57,10 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_IN_TEST = 0x6, + BR_MRP_TLV_HEADER_IN_TOPO = 0x7, + BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8, + BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9, BR_MRP_TLV_HEADER_OPTION = 0x7f, }; @@ -118,4 +134,26 @@ struct br_mrp_oui_hdr { __u8 oui[MRP_OUI_LENGTH]; }; +struct br_mrp_in_test_hdr { + __be16 id; + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; +}; + +struct br_mrp_in_topo_hdr { + __u8 sa[ETH_ALEN]; + __be16 id; + __be16 interval; +}; + +struct br_mrp_in_link_hdr { + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 id; + __be16 interval; +}; + #endif -- cgit v1.2.3-59-g8ed1b From 559139cb0405d38816e5e725adee9000db993235 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:56 +0200 Subject: bridge: uapi: mrp: Extend MRP_INFO attributes for interconnect status Extend the existing MRP_INFO to return status of MRP interconnect. In case there is no MRP interconnect on the node then the role will be disabled so the other attributes can be ignored. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d840a3e37a37..c1227aecd38f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -243,6 +243,11 @@ enum { IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + IFLA_BRIDGE_MRP_INFO_IN_STATE, + IFLA_BRIDGE_MRP_INFO_IN_ROLE, + IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, __IFLA_BRIDGE_MRP_INFO_MAX, }; -- cgit v1.2.3-59-g8ed1b From ffb3adba64801f70c472303c9e386eb5eaec193d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:58 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_IN_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_IN_OPEN, which allows to notify the userspace when the node lost the contiuity of MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc185a007ade..26842ffd0501 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c532fa65c983..147d52596e17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + 0; } @@ -216,6 +217,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, + !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cafedbbfefbe..781e482dc499 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3-59-g8ed1b From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:43 -0700 Subject: seccomp: Introduce addfd ioctl to seccomp user notifier The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig Cc: Christian Brauner Cc: Tycho Andersen Cc: Jann Horn Cc: Robert Sesek Cc: Chris Palmer Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon Reviewed-by: Will Drewry Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 + include/uapi/linux/seccomp.h | 22 ++++++ kernel/seccomp.c | 175 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 199 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index babcd6c02d09..881c90b6aa25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -10,6 +10,10 @@ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH) +/* sizeof() the first published struct seccomp_notif_addfd */ +#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 +#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 + #ifdef CONFIG_SECCOMP #include diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 965290f7dcc2..6ba18b82a02e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -113,6 +113,25 @@ struct seccomp_notif_resp { __u32 flags; }; +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +/** + * struct seccomp_notif_addfd + * @id: The ID of the seccomp notification + * @flags: SECCOMP_ADDFD_FLAG_* + * @srcfd: The local fd number + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. + * @newfd_flags: The O_* flags the remote FD should have applied + */ +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; + #define SECCOMP_IOC_MAGIC '!' #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) @@ -124,5 +143,8 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) +/* On success, the return value is the remote process's added fd number */ +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ + struct seccomp_notif_addfd) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 866a432cd746..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); @@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall, /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: + mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); @@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From c201324b54553aebb32845193680f21eb493c6e5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:42:42 -0700 Subject: net: caif: drop duplicate words in comments Drop doubled words "or" and "the" in several comments. Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Jakub Kicinski --- include/net/caif/caif_layer.h | 4 ++-- include/uapi/linux/caif/caif_socket.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h index 064094101cb5..51f7bb42a936 100644 --- a/include/net/caif/caif_layer.h +++ b/include/net/caif/caif_layer.h @@ -156,7 +156,7 @@ struct cflayer { * CAIF packets upwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * called receive function. This means that the the + * called receive function. This means that the * packet cannot be accessed after passing it to the * above layer using up->receive(). * @@ -184,7 +184,7 @@ struct cflayer { * CAIF packet downwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * transmit function. This means that the the packet + * transmit function. This means that the packet * cannot be accessed after passing it to the below * layer using dn->transmit(). * diff --git a/include/uapi/linux/caif/caif_socket.h b/include/uapi/linux/caif/caif_socket.h index 10ec1d1cf68e..d9970bbaa156 100644 --- a/include/uapi/linux/caif/caif_socket.h +++ b/include/uapi/linux/caif/caif_socket.h @@ -169,7 +169,7 @@ struct sockaddr_caif { * @CAIFSO_LINK_SELECT: Selector used if multiple CAIF Link layers are * available. Either a high bandwidth * link can be selected (CAIF_LINK_HIGH_BANDW) or - * or a low latency link (CAIF_LINK_LOW_LATENCY). + * a low latency link (CAIF_LINK_LOW_LATENCY). * This option is of type __u32. * Alternatively SO_BINDTODEVICE can be used. * -- cgit v1.2.3-59-g8ed1b From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As it has been already done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/cpumap.c | 28 +++++++++++++++------------- tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 323c91c4fab0..ff48dc00e8d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,13 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, -- cgit v1.2.3-59-g8ed1b From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- include/linux/bpf.h | 6 ++ include/net/xdp.h | 5 ++ include/trace/events/xdp.h | 14 +++-- include/uapi/linux/bpf.h | 5 ++ kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++----- net/core/dev.c | 9 +++ tools/include/uapi/linux/bpf.h | 5 ++ 7 files changed, 148 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c67c88ad35f8..54ad426dbea1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool cpu_map_prog_allowed(struct bpf_map *map); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return false; +} + static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 5b383c450858..83b9e0142b52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -98,6 +98,11 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +struct xdp_cpumap_stats { + unsigned int pass; + unsigned int drop; +}; + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b73d3e141323..e2c99f5bee39 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, - int sched), + int sched, struct xdp_cpumap_stats *xdp_stats), - TP_ARGS(map_id, processed, drops, sched), + TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) @@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) + __field(unsigned int, xdp_pass) + __field(unsigned int, xdp_drop) ), TP_fast_assign( @@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; + __entry->xdp_pass = xdp_stats->pass; + __entry->xdp_drop = xdp_stats->drop; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" - " sched=%d", + " sched=%d" + " xdp_pass=%u xdp_drop=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, - __entry->sched) + __entry->sched, + __entry->xdp_pass, __entry->xdp_drop) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { -- cgit v1.2.3-59-g8ed1b From bfdfa51702dec67e9fcd52568b4cf3c7f799db8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:29:11 -0700 Subject: bpf: Drop duplicated words in uapi helper comments Drop doubled words "will" and "attach". Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6b9f71ae-4f8e-0259-2c5d-187ddaefe6eb@infradead.org --- include/uapi/linux/bpf.h | 6 +++--- tools/include/uapi/linux/bpf.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ -- cgit v1.2.3-59-g8ed1b From bbe4f4245271bd0f21bf826996c0c5d87a3529c9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 7 Jul 2020 09:30:59 +0300 Subject: RDMA/qedr: Add EDPM mode type for user-fw compatibility In older FW versions the completion flag was treated as the ack flag in edpm messages. commit ff937b916eb6 ("qed: Add EDPM mode type for user-fw compatibility") exposed the FW option of setting which mode the QP is in by adding a flag to the qedr <-> qed API. This patch adds the qedr <-> libqedr interface so that the libqedr can set the flag appropriately and qedr can pass it down to FW. Flag is added for backward compatibility with libqedr. For older libs, this flag didn't exist and therefore set to zero. Fixes: ac1b36e55a51 ("qedr: Add support for user context verbs") Link: https://lore.kernel.org/r/20200707063100.3811-2-michal.kalderon@marvell.com Signed-off-by: Yuval Bason Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr.h | 1 + drivers/infiniband/hw/qedr/verbs.c | 11 ++++++++--- include/uapi/rdma/qedr-abi.h | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index aa332027da86..460292179b32 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -235,6 +235,7 @@ struct qedr_ucontext { u32 dpi_size; u16 dpi; bool db_rec; + u8 edpm_mode; }; union db_prod32 { diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 42273aa0b5e1..5008149ea116 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -275,7 +275,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) DP_ERR(dev, "Problem copying data from user space\n"); return -EFAULT; } - + ctx->edpm_mode = !!(ureq.context_flags & + QEDR_ALLOC_UCTX_EDPM_MODE); ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC); } @@ -316,7 +317,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) uresp.dpm_flags = QEDR_DPM_TYPE_IWARP_LEGACY; else uresp.dpm_flags = QEDR_DPM_TYPE_ROCE_ENHANCED | - QEDR_DPM_TYPE_ROCE_LEGACY; + QEDR_DPM_TYPE_ROCE_LEGACY | + QEDR_DPM_TYPE_ROCE_EDPM_MODE; uresp.dpm_flags |= QEDR_DPM_SIZES_SET; uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; @@ -1750,7 +1752,7 @@ static int qedr_create_user_qp(struct qedr_dev *dev, struct qed_rdma_create_qp_out_params out_params; struct qedr_pd *pd = get_qedr_pd(ibpd); struct qedr_create_qp_uresp uresp; - struct qedr_ucontext *ctx = NULL; + struct qedr_ucontext *ctx = pd ? pd->uctx : NULL; struct qedr_create_qp_ureq ureq; int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1); int rc = -EINVAL; @@ -1788,6 +1790,9 @@ static int qedr_create_user_qp(struct qedr_dev *dev, in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; } + if (ctx) + SET_FIELD(in_params.flags, QED_ROCE_EDPM_MODE, ctx->edpm_mode); + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, &in_params, &out_params); diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index a0b83c9d4498..b261c9fca07b 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -39,7 +39,7 @@ /* user kernel communication data structures. */ enum qedr_alloc_ucontext_flags { - QEDR_ALLOC_UCTX_RESERVED = 1 << 0, + QEDR_ALLOC_UCTX_EDPM_MODE = 1 << 0, QEDR_ALLOC_UCTX_DB_REC = 1 << 1 }; @@ -56,7 +56,7 @@ enum qedr_rdma_dpm_type { QEDR_DPM_TYPE_ROCE_ENHANCED = 1 << 0, QEDR_DPM_TYPE_ROCE_LEGACY = 1 << 1, QEDR_DPM_TYPE_IWARP_LEGACY = 1 << 2, - QEDR_DPM_TYPE_RESERVED = 1 << 3, + QEDR_DPM_TYPE_ROCE_EDPM_MODE = 1 << 3, QEDR_DPM_SIZES_SET = 1 << 4, }; -- cgit v1.2.3-59-g8ed1b From eb7f84e379daad69b4c92538baeaf93bbf493c14 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 7 Jul 2020 09:31:00 +0300 Subject: RDMA/qedr: Add EDPM max size to alloc ucontext response User space should receive the maximum edpm size from kernel driver, similar to other edpm/ldpm related limits. Add an additional parameter to the alloc_ucontext_resp structure for the edpm maximum size. In addition, pass an indication from user-space to kernel (and not just kernel to user) that the DPM sizes are supported. This is for supporting backward-forward compatibility between driver and lib for everything related to DPM transaction and limit sizes. This should have been part of commit mentioned in Fixes tag. Link: https://lore.kernel.org/r/20200707063100.3811-3-michal.kalderon@marvell.com Fixes: 93a3d05f9d68 ("RDMA/qedr: Add kernel capability flags for dpm enabled mode") Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 9 ++++++--- include/uapi/rdma/qedr-abi.h | 6 +++++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 5008149ea116..fcf2eaa3b459 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -320,9 +320,12 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) QEDR_DPM_TYPE_ROCE_LEGACY | QEDR_DPM_TYPE_ROCE_EDPM_MODE; - uresp.dpm_flags |= QEDR_DPM_SIZES_SET; - uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; - uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + if (ureq.context_flags & QEDR_SUPPORT_DPM_SIZES) { + uresp.dpm_flags |= QEDR_DPM_SIZES_SET; + uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; + uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + uresp.edpm_limit_size = QEDR_EDPM_MAX_SIZE; + } uresp.wids_enabled = 1; uresp.wid_count = oparams.wid_count; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index b261c9fca07b..bf7333b2b5d7 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -40,7 +40,8 @@ /* user kernel communication data structures. */ enum qedr_alloc_ucontext_flags { QEDR_ALLOC_UCTX_EDPM_MODE = 1 << 0, - QEDR_ALLOC_UCTX_DB_REC = 1 << 1 + QEDR_ALLOC_UCTX_DB_REC = 1 << 1, + QEDR_SUPPORT_DPM_SIZES = 1 << 2, }; struct qedr_alloc_ucontext_req { @@ -50,6 +51,7 @@ struct qedr_alloc_ucontext_req { #define QEDR_LDPM_MAX_SIZE (8192) #define QEDR_EDPM_TRANS_SIZE (64) +#define QEDR_EDPM_MAX_SIZE (ROCE_REQ_MAX_INLINE_DATA_SIZE) enum qedr_rdma_dpm_type { QEDR_DPM_TYPE_NONE = 0, @@ -77,6 +79,8 @@ struct qedr_alloc_ucontext_resp { __u16 ldpm_limit_size; __u8 edpm_trans_size; __u8 reserved; + __u16 edpm_limit_size; + __u8 padding[6]; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3-59-g8ed1b From e3a5a1e8b6548f5d37328e2d3571edc5c9e6d7c0 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Thu, 16 Jul 2020 12:12:35 -0700 Subject: tcp: add SNMP counter for no. of duplicate segments reported by DSACK There are two existing SNMP counters, TCPDSACKRecv and TCPDSACKOfoRecv, which are incremented depending on whether the DSACKed range is below the cumulative ACK sequence number or not. Unfortunately, these both implicitly assume each DSACK covers only one segment. This makes these counters unusable for estimating spurious retransmit rates, or real/non-spurious loss rate. This patch introduces a new SNMP counter, TCPDSACKRecvSegs, which tracks the estimated number of duplicate segments based on: (DSACKed sequence range) / MSS. This counter is usable for estimating spurious retransmit rates, or real/non-spurious loss rate. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7d91f4debc48..cee9f8e6fce3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -287,6 +287,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ + LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d6bbcb1e570..82906deb7874 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1153,6 +1153,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && -- cgit v1.2.3-59-g8ed1b From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 3 + include/linux/bpf.h | 1 + include/linux/bpf_types.h | 2 + include/linux/filter.h | 17 +++++ include/uapi/linux/bpf.h | 77 +++++++++++++++++++ kernel/bpf/net_namespace.c | 5 ++ kernel/bpf/syscall.c | 9 +++ kernel/bpf/verifier.c | 13 +++- net/core/filter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 9 ++- 10 files changed, 312 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 47d5b0c708c9..722f799c1a2e 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -8,6 +8,7 @@ enum netns_bpf_attach_type { NETNS_BPF_INVALID = -1, NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, MAX_NETNS_BPF_ATTACH_TYPE }; @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type) switch (attach_type) { case BPF_FLOW_DISSECTOR: return NETNS_BPF_FLOW_DISSECTOR; + case BPF_SK_LOOKUP: + return NETNS_BPF_SK_LOOKUP; default: return NETNS_BPF_INVALID; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8c9eabcd106..adb16bdc5f0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -249,6 +249,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a18ae82a298a..a52a5688418e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup, + struct bpf_sk_lookup, struct bpf_sk_lookup_kern) #endif #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, diff --git a/include/linux/filter.h b/include/linux/filter.h index 0b0144752d78..fa1ea12ad2cd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern { s32 retval; }; +struct bpf_sk_lookup_kern { + u16 family; + u16 protocol; + struct { + __be32 saddr; + __be32 daddr; + } v4; + struct { + const struct in6_addr *saddr; + const struct in6_addr *daddr; + } v6; + __be16 sport; + u16 dport; + struct sock *selected_sk; + bool no_reuseport; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ac3992dacfe..54d0c886e3ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -228,6 +229,7 @@ enum bpf_attach_type { BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, __MAX_BPF_ATTACH_TYPE }; @@ -3069,6 +3071,10 @@ union bpf_attr { * * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3094,6 +3100,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3607,6 +3663,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -4349,4 +4411,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6843376733df..5bfa448b4704 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -404,6 +404,7 @@ class PrinterHelpers(Printer): type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', @@ -487,6 +489,11 @@ class PrinterHelpers(Printer): 'struct sk_msg_buff': 'struct sk_msg_md', 'struct xdp_buff': 'struct xdp_md', } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] def print_header(self): header = '''\ @@ -543,7 +550,7 @@ class PrinterHelpers(Printer): for i, a in enumerate(proto['args']): t = a['type'] n = a['name'] - if proto['name'] == 'bpf_get_socket_cookie' and i == 0: + if proto['name'] in self.overloaded_helpers and i == 0: t = 'void' n = 'ctx' one_arg = '{}{}'.format(comma, self.map_type(t)) -- cgit v1.2.3-59-g8ed1b From b3ab1c6058fad8cd5726f24e9ed9053e43bb2af4 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 24 Jun 2020 21:28:00 +0200 Subject: media: Add V4L2_TYPE_IS_CAPTURE helper It's all too easy to get confused by the V4L2_TYPE_IS_OUTPUT macro, when it's used as !V4L2_TYPE_IS_OUTPUT. Reduce the risk of confusion with macro to explicitly check for the CAPTURE queue type case. This change does not affect functionality, and it's only intended to make the code more readable. Suggested-by: Nicolas Dufresne Signed-off-by: Ezequiel Garcia Signed-off-by: Hans Verkuil [hverkuil-cisco@xs4all.nl: checkpatch: align with parenthesis] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/common/videobuf2/videobuf2-v4l2.c | 4 ++-- drivers/media/platform/exynos-gsc/gsc-core.c | 2 +- drivers/media/platform/exynos-gsc/gsc-m2m.c | 2 +- drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c | 2 +- drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c | 7 +++---- drivers/media/platform/rcar_jpu.c | 2 +- drivers/media/platform/sti/hva/hva-v4l2.c | 2 +- drivers/media/platform/ti-vpe/vpe.c | 2 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 6 +++--- drivers/media/v4l2-core/v4l2-mem2mem.c | 6 +++--- drivers/staging/media/hantro/hantro_v4l2.c | 2 +- drivers/staging/media/rkvdec/rkvdec.c | 2 +- include/uapi/linux/videodev2.h | 2 ++ 13 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 57aa183bd198..30caad27281e 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -97,7 +97,7 @@ static int __verify_length(struct vb2_buffer *vb, const struct v4l2_buffer *b) unsigned int bytesused; unsigned int plane; - if (!V4L2_TYPE_IS_OUTPUT(b->type)) + if (V4L2_TYPE_IS_CAPTURE(b->type)) return 0; if (V4L2_TYPE_IS_MULTIPLANAR(b->type)) { @@ -311,7 +311,7 @@ static int vb2_fill_vb2_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b /* Zero flags that we handle */ vbuf->flags = b->flags & ~V4L2_BUFFER_MASK_FLAGS; - if (!vb->vb2_queue->copy_timestamp || !V4L2_TYPE_IS_OUTPUT(b->type)) { + if (!vb->vb2_queue->copy_timestamp || V4L2_TYPE_IS_CAPTURE(b->type)) { /* * Non-COPY timestamps and non-OUTPUT queues will get * their timestamp and timestamp source flags from the diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index f6650b45bc3d..9f41c2e7097a 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -577,7 +577,7 @@ int gsc_try_selection(struct gsc_ctx *ctx, struct v4l2_selection *s) v4l_bound_align_image(&tmp_w, min_w, max_w, mod_x, &tmp_h, min_h, max_h, mod_y, 0); - if (!V4L2_TYPE_IS_OUTPUT(s->type) && + if (V4L2_TYPE_IS_CAPTURE(s->type) && (ctx->gsc_ctrls.rotate->val == 90 || ctx->gsc_ctrls.rotate->val == 270)) gsc_check_crop_change(tmp_h, tmp_w, diff --git a/drivers/media/platform/exynos-gsc/gsc-m2m.c b/drivers/media/platform/exynos-gsc/gsc-m2m.c index e2c162635f72..27a3c92c73bc 100644 --- a/drivers/media/platform/exynos-gsc/gsc-m2m.c +++ b/drivers/media/platform/exynos-gsc/gsc-m2m.c @@ -255,7 +255,7 @@ static int gsc_m2m_buf_prepare(struct vb2_buffer *vb) if (IS_ERR(frame)) return PTR_ERR(frame); - if (!V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) { + if (V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) { for (i = 0; i < frame->fmt->num_planes; i++) vb2_set_plane_payload(vb, i, frame->payload[i]); } diff --git a/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c b/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c index f82a81a3bdee..61fed1e35a00 100644 --- a/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c +++ b/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c @@ -731,7 +731,7 @@ static void mtk_jpeg_stop_streaming(struct vb2_queue *q) * subsampling. Update capture queue when the stream is off. */ if (ctx->state == MTK_JPEG_SOURCE_CHANGE && - !V4L2_TYPE_IS_OUTPUT(q->type)) { + V4L2_TYPE_IS_CAPTURE(q->type)) { struct mtk_jpeg_src_buf *src_buf; vb = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c b/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c index bb9caaf513bc..724c7333b6e5 100644 --- a/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c +++ b/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c @@ -193,7 +193,7 @@ static const struct mtk_mdp_fmt *mtk_mdp_try_fmt_mplane(struct mtk_mdp_ctx *ctx, pix_mp->field = V4L2_FIELD_NONE; pix_mp->pixelformat = fmt->pixelformat; - if (!V4L2_TYPE_IS_OUTPUT(f->type)) { + if (V4L2_TYPE_IS_CAPTURE(f->type)) { pix_mp->colorspace = ctx->colorspace; pix_mp->xfer_func = ctx->xfer_func; pix_mp->ycbcr_enc = ctx->ycbcr_enc; @@ -327,9 +327,8 @@ static int mtk_mdp_try_crop(struct mtk_mdp_ctx *ctx, u32 type, mtk_mdp_bound_align_image(&new_w, min_w, max_w, align_w, &new_h, min_h, max_h, align_h); - if (!V4L2_TYPE_IS_OUTPUT(type) && - (ctx->ctrls.rotate->val == 90 || - ctx->ctrls.rotate->val == 270)) + if (V4L2_TYPE_IS_CAPTURE(type) && + (ctx->ctrls.rotate->val == 90 || ctx->ctrls.rotate->val == 270)) mtk_mdp_check_crop_change(new_h, new_w, &r->width, &r->height); else diff --git a/drivers/media/platform/rcar_jpu.c b/drivers/media/platform/rcar_jpu.c index 5250a14324e9..9b99ff368698 100644 --- a/drivers/media/platform/rcar_jpu.c +++ b/drivers/media/platform/rcar_jpu.c @@ -1066,7 +1066,7 @@ static int jpu_buf_prepare(struct vb2_buffer *vb) } /* decoder capture queue */ - if (!ctx->encoder && !V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) + if (!ctx->encoder && V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) vb2_set_plane_payload(vb, i, size); } diff --git a/drivers/media/platform/sti/hva/hva-v4l2.c b/drivers/media/platform/sti/hva/hva-v4l2.c index 197b99d8fd9c..bb34d6997d99 100644 --- a/drivers/media/platform/sti/hva/hva-v4l2.c +++ b/drivers/media/platform/sti/hva/hva-v4l2.c @@ -1087,7 +1087,7 @@ static void hva_stop_streaming(struct vb2_queue *vq) if ((V4L2_TYPE_IS_OUTPUT(vq->type) && vb2_is_streaming(&ctx->fh.m2m_ctx->cap_q_ctx.q)) || - (!V4L2_TYPE_IS_OUTPUT(vq->type) && + (V4L2_TYPE_IS_CAPTURE(vq->type) && vb2_is_streaming(&ctx->fh.m2m_ctx->out_q_ctx.q))) { dev_dbg(dev, "%s %s out=%d cap=%d\n", ctx->name, to_type_str(vq->type), diff --git a/drivers/media/platform/ti-vpe/vpe.c b/drivers/media/platform/ti-vpe/vpe.c index cff2fcd6d812..346f8212791c 100644 --- a/drivers/media/platform/ti-vpe/vpe.c +++ b/drivers/media/platform/ti-vpe/vpe.c @@ -1576,7 +1576,7 @@ static int vpe_g_fmt(struct file *file, void *priv, struct v4l2_format *f) *f = q_data->format; - if (!V4L2_TYPE_IS_OUTPUT(f->type)) { + if (V4L2_TYPE_IS_CAPTURE(f->type)) { struct vpe_q_data *s_q_data; struct v4l2_pix_format_mplane *spix; diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index e879290727ef..8941d73f6611 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -1442,7 +1442,7 @@ static void vicodec_buf_queue(struct vb2_buffer *vb) .u.src_change.changes = V4L2_EVENT_SRC_CH_RESOLUTION, }; - if (!V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type) && + if (V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type) && vb2_is_streaming(vb->vb2_queue) && v4l2_m2m_dst_buf_is_last(ctx->fh.m2m_ctx)) { unsigned int i; @@ -1479,7 +1479,7 @@ static void vicodec_buf_queue(struct vb2_buffer *vb) * in the compressed stream */ if (ctx->is_stateless || ctx->is_enc || - !V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) { + V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) { v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); return; } @@ -1574,7 +1574,7 @@ static int vicodec_start_streaming(struct vb2_queue *q, state->gop_cnt = 0; if ((V4L2_TYPE_IS_OUTPUT(q->type) && !ctx->is_enc) || - (!V4L2_TYPE_IS_OUTPUT(q->type) && ctx->is_enc)) + (V4L2_TYPE_IS_CAPTURE(q->type) && ctx->is_enc)) return 0; if (info->id == V4L2_PIX_FMT_FWHT || diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c index 62ac9424c92a..95a8f2dc5341 100644 --- a/drivers/media/v4l2-core/v4l2-mem2mem.c +++ b/drivers/media/v4l2-core/v4l2-mem2mem.c @@ -556,7 +556,7 @@ int v4l2_m2m_querybuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, ret = vb2_querybuf(vq, buf); /* Adjust MMAP memory offsets for the CAPTURE queue */ - if (buf->memory == V4L2_MEMORY_MMAP && !V4L2_TYPE_IS_OUTPUT(vq->type)) { + if (buf->memory == V4L2_MEMORY_MMAP && V4L2_TYPE_IS_CAPTURE(vq->type)) { if (V4L2_TYPE_IS_MULTIPLANAR(vq->type)) { for (i = 0; i < buf->length; ++i) buf->m.planes[i].m.mem_offset @@ -712,7 +712,7 @@ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, int ret; vq = v4l2_m2m_get_vq(m2m_ctx, buf->type); - if (!V4L2_TYPE_IS_OUTPUT(vq->type) && + if (V4L2_TYPE_IS_CAPTURE(vq->type) && (buf->flags & V4L2_BUF_FLAG_REQUEST_FD)) { dprintk("%s: requests cannot be used with capture buffers\n", __func__); @@ -729,7 +729,7 @@ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, * buffer as DONE with LAST flag since it won't be queued on the * device. */ - if (!V4L2_TYPE_IS_OUTPUT(vq->type) && + if (V4L2_TYPE_IS_CAPTURE(vq->type) && vb2_is_streaming(vq) && !vb2_start_streaming_called(vq) && (v4l2_m2m_has_stopped(m2m_ctx) || v4l2_m2m_dst_buf_is_last(m2m_ctx))) v4l2_m2m_force_last_buf_done(m2m_ctx, vq); diff --git a/drivers/staging/media/hantro/hantro_v4l2.c b/drivers/staging/media/hantro/hantro_v4l2.c index f28a94e2fa93..63859e8a0923 100644 --- a/drivers/staging/media/hantro/hantro_v4l2.c +++ b/drivers/staging/media/hantro/hantro_v4l2.c @@ -237,7 +237,7 @@ static int hantro_try_fmt(const struct hantro_ctx *ctx, enum v4l2_buf_type type) { const struct hantro_fmt *fmt, *vpu_fmt; - bool capture = !V4L2_TYPE_IS_OUTPUT(type); + bool capture = V4L2_TYPE_IS_CAPTURE(type); bool coded; coded = capture == hantro_is_encoder_ctx(ctx); diff --git a/drivers/staging/media/rkvdec/rkvdec.c b/drivers/staging/media/rkvdec/rkvdec.c index 225eeca73356..fd68671f0286 100644 --- a/drivers/staging/media/rkvdec/rkvdec.c +++ b/drivers/staging/media/rkvdec/rkvdec.c @@ -489,7 +489,7 @@ static int rkvdec_start_streaming(struct vb2_queue *q, unsigned int count) const struct rkvdec_coded_fmt_desc *desc; int ret; - if (!V4L2_TYPE_IS_OUTPUT(q->type)) + if (V4L2_TYPE_IS_CAPTURE(q->type)) return 0; desc = ctx->coded_fmt_desc; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 303805438814..c7b70ff53bc1 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -171,6 +171,8 @@ enum v4l2_buf_type { || (type) == V4L2_BUF_TYPE_SDR_OUTPUT \ || (type) == V4L2_BUF_TYPE_META_OUTPUT) +#define V4L2_TYPE_IS_CAPTURE(type) (!V4L2_TYPE_IS_OUTPUT(type)) + enum v4l2_tuner_type { V4L2_TUNER_RADIO = 1, V4L2_TUNER_ANALOG_TV = 2, -- cgit v1.2.3-59-g8ed1b From 124ea650d3072b005457faed69909221c2905a1f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:11 +0200 Subject: capabilities: Introduce CAP_CHECKPOINT_RESTORE This patch introduces CAP_CHECKPOINT_RESTORE, a new capability facilitating checkpoint/restore for non-root users. Over the last years, The CRIU (Checkpoint/Restore In Userspace) team has been asked numerous times if it is possible to checkpoint/restore a process as non-root. The answer usually was: 'almost'. The main blocker to restore a process as non-root was to control the PID of the restored process. This feature available via the clone3 system call, or via /proc/sys/kernel/ns_last_pid is unfortunately guarded by CAP_SYS_ADMIN. In the past two years, requests for non-root checkpoint/restore have increased due to the following use cases: * Checkpoint/Restore in an HPC environment in combination with a resource manager distributing jobs where users are always running as non-root. There is a desire to provide a way to checkpoint and restore long running jobs. * Container migration as non-root * We have been in contact with JVM developers who are integrating CRIU into a Java VM to decrease the startup time. These checkpoint/restore applications are not meant to be running with CAP_SYS_ADMIN. We have seen the following workarounds: * Use a setuid wrapper around CRIU: See https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c * Use a setuid helper that writes to ns_last_pid. Unfortunately, this helper delegation technique is impossible to use with clone3, and is thus prone to races. See https://github.com/twosigma/set_ns_last_pid * Cycle through PIDs with fork() until the desired PID is reached: This has been demonstrated to work with cycling rates of 100,000 PIDs/s See https://github.com/twosigma/set_ns_last_pid * Patch out the CAP_SYS_ADMIN check from the kernel * Run the desired application in a new user and PID namespace to provide a local CAP_SYS_ADMIN for controlling PIDs. This technique has limited use in typical container environments (e.g., Kubernetes) as /proc is typically protected with read-only layers (e.g., /proc/sys) for hardening purposes. Read-only layers prevent additional /proc mounts (due to proc's SB_I_USERNS_VISIBLE property), making the use of new PID namespaces limited as certain applications need access to /proc matching their PID namespace. The introduced capability allows to: * Control PIDs when the current user is CAP_CHECKPOINT_RESTORE capable for the corresponding PID namespace via ns_last_pid/clone3. * Open files in /proc/pid/map_files when the current user is CAP_CHECKPOINT_RESTORE capable in the root namespace, useful for recovering files that are unreachable via the file system such as deleted files, or memfd files. See corresponding selftest for an example with clone3(). Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-2-areber@redhat.com Signed-off-by: Christian Brauner --- include/linux/capability.h | 6 ++++++ include/uapi/linux/capability.h | 9 ++++++++- security/selinux/include/classmap.h | 5 +++-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/capability.h b/include/linux/capability.h index b4345b38a6be..1e7fe311cabe 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -261,6 +261,12 @@ static inline bool bpf_capable(void) return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } +static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) +{ + return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || + ns_capable(ns, CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 48ff0757ae5e..395dd0df8d08 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -408,7 +408,14 @@ struct vfs_ns_cap_data { */ #define CAP_BPF 39 -#define CAP_LAST_CAP CAP_BPF + +/* Allow checkpoint/restore related operations */ +/* Allow PID selection during clone3() */ +/* Allow writing to ns_last_pid */ + +#define CAP_CHECKPOINT_RESTORE 40 + +#define CAP_LAST_CAP CAP_CHECKPOINT_RESTORE #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 98e1513b608a..40cebde62856 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,10 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \ + "checkpoint_restore" -#if CAP_LAST_CAP > CAP_BPF +#if CAP_LAST_CAP > CAP_CHECKPOINT_RESTORE #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3-59-g8ed1b From c4471ad9a50d5548e66ae4511acfb1dc23a48744 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Jul 2020 00:03:33 +0200 Subject: net: phy: add USXGMII link partner ability constants The constants are taken from the USXGMII Singleport Copper Interface specification. The naming are based on the SGMII ones, but with an MDIO_ prefix. Signed-off-by: Michael Walle Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 4bcb41c71b8c..3f302e2523b2 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -324,4 +324,30 @@ static inline __u16 mdio_phy_id_c45(int prtad, int devad) return MDIO_PHY_ID_C45 | (prtad << 5) | devad; } +/* UsxgmiiChannelInfo[15:0] for USXGMII in-band auto-negotiation.*/ +#define MDIO_USXGMII_EEE_CLK_STP 0x0080 /* EEE clock stop supported */ +#define MDIO_USXGMII_EEE 0x0100 /* EEE supported */ +#define MDIO_USXGMII_SPD_MASK 0x0e00 /* USXGMII speed mask */ +#define MDIO_USXGMII_FULL_DUPLEX 0x1000 /* USXGMII full duplex */ +#define MDIO_USXGMII_DPX_SPD_MASK 0x1e00 /* USXGMII duplex and speed bits */ +#define MDIO_USXGMII_10 0x0000 /* 10Mbps */ +#define MDIO_USXGMII_10HALF 0x0000 /* 10Mbps half-duplex */ +#define MDIO_USXGMII_10FULL 0x1000 /* 10Mbps full-duplex */ +#define MDIO_USXGMII_100 0x0200 /* 100Mbps */ +#define MDIO_USXGMII_100HALF 0x0200 /* 100Mbps half-duplex */ +#define MDIO_USXGMII_100FULL 0x1200 /* 100Mbps full-duplex */ +#define MDIO_USXGMII_1000 0x0400 /* 1000Mbps */ +#define MDIO_USXGMII_1000HALF 0x0400 /* 1000Mbps half-duplex */ +#define MDIO_USXGMII_1000FULL 0x1400 /* 1000Mbps full-duplex */ +#define MDIO_USXGMII_10G 0x0600 /* 10Gbps */ +#define MDIO_USXGMII_10GHALF 0x0600 /* 10Gbps half-duplex */ +#define MDIO_USXGMII_10GFULL 0x1600 /* 10Gbps full-duplex */ +#define MDIO_USXGMII_2500 0x0800 /* 2500Mbps */ +#define MDIO_USXGMII_2500HALF 0x0800 /* 2500Mbps half-duplex */ +#define MDIO_USXGMII_2500FULL 0x1800 /* 2500Mbps full-duplex */ +#define MDIO_USXGMII_5000 0x0a00 /* 5000Mbps */ +#define MDIO_USXGMII_5000HALF 0x0a00 /* 5000Mbps half-duplex */ +#define MDIO_USXGMII_5000FULL 0x1a00 /* 5000Mbps full-duplex */ +#define MDIO_USXGMII_LINK 0x8000 /* PHY link with copper-side partner */ + #endif /* _UAPI__LINUX_MDIO_H__ */ -- cgit v1.2.3-59-g8ed1b From 55db9c0e853421fa71cac5e6855898601f78a1f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jul 2020 08:23:15 +0200 Subject: net: remove compat_sys_{get,set}sockopt Now that the ->compat_{get,set}sockopt proto_ops methods are gone there is no good reason left to keep the compat syscalls separate. This fixes the odd use of unsigned int for the compat_setsockopt optlen and the missing sock_use_custom_sol_socket. It would also easily allow running the eBPF hooks for the compat syscalls, but such a large change in behavior does not belong into a consolidation patch like this one. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- arch/arm64/include/asm/unistd32.h | 4 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 4 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 4 +- arch/parisc/kernel/syscalls/syscall.tbl | 4 +- arch/powerpc/kernel/syscalls/syscall.tbl | 4 +- arch/s390/kernel/syscalls/syscall.tbl | 4 +- arch/sparc/kernel/sys32.S | 12 ++-- arch/sparc/kernel/syscalls/syscall.tbl | 4 +- arch/x86/entry/syscall_x32.c | 7 ++ arch/x86/entry/syscalls/syscall_32.tbl | 4 +- arch/x86/entry/syscalls/syscall_64.tbl | 4 +- include/linux/compat.h | 4 -- include/linux/syscalls.h | 4 ++ include/uapi/asm-generic/unistd.h | 4 +- net/compat.c | 79 +--------------------- net/socket.c | 25 ++++--- tools/include/uapi/asm-generic/unistd.h | 4 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 +- 20 files changed, 62 insertions(+), 125 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6d95d0c8bf2f..166e36903110 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -599,9 +599,9 @@ __SYSCALL(__NR_recvfrom, compat_sys_recvfrom) #define __NR_shutdown 293 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_setsockopt 294 -__SYSCALL(__NR_setsockopt, compat_sys_setsockopt) +__SYSCALL(__NR_setsockopt, sys_setsockopt) #define __NR_getsockopt 295 -__SYSCALL(__NR_getsockopt, compat_sys_getsockopt) +__SYSCALL(__NR_getsockopt, sys_getsockopt) #define __NR_sendmsg 296 __SYSCALL(__NR_sendmsg, compat_sys_sendmsg) #define __NR_recvmsg 297 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index f777141f5256..8488b0d0a99e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -60,8 +60,8 @@ 50 n32 getsockname sys_getsockname 51 n32 getpeername sys_getpeername 52 n32 socketpair sys_socketpair -53 n32 setsockopt compat_sys_setsockopt -54 n32 getsockopt compat_sys_getsockopt +53 n32 setsockopt sys_setsockopt +54 n32 getsockopt sys_getsockopt 55 n32 clone __sys_clone 56 n32 fork __sys_fork 57 n32 execve compat_sys_execve diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 13280625d312..b20522f813f9 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -184,7 +184,7 @@ 170 o32 connect sys_connect 171 o32 getpeername sys_getpeername 172 o32 getsockname sys_getsockname -173 o32 getsockopt sys_getsockopt compat_sys_getsockopt +173 o32 getsockopt sys_getsockopt sys_getsockopt 174 o32 listen sys_listen 175 o32 recv sys_recv compat_sys_recv 176 o32 recvfrom sys_recvfrom compat_sys_recvfrom @@ -192,7 +192,7 @@ 178 o32 send sys_send 179 o32 sendmsg sys_sendmsg compat_sys_sendmsg 180 o32 sendto sys_sendto -181 o32 setsockopt sys_setsockopt compat_sys_setsockopt +181 o32 setsockopt sys_setsockopt sys_setsockopt 182 o32 shutdown sys_shutdown 183 o32 socket sys_socket 184 o32 socketpair sys_socketpair diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5a758fa6ec52..3494e4fa1a17 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -198,8 +198,8 @@ 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common chown sys_chown -181 common setsockopt sys_setsockopt compat_sys_setsockopt -182 common getsockopt sys_getsockopt compat_sys_getsockopt +181 common setsockopt sys_setsockopt sys_setsockopt +182 common getsockopt sys_getsockopt sys_getsockopt 183 common sendmsg sys_sendmsg compat_sys_sendmsg 184 common recvmsg sys_recvmsg compat_sys_recvmsg 185 common semop sys_semop diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f833a3190822..94eb5b27ef65 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -433,8 +433,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bfdcb7633957..0d63c71fc544 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname sys_getsockname 368 common getpeername sys_getpeername sys_getpeername 369 common sendto sys_sendto sys_sendto diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 489ffab918a8..a45f0f31fe51 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -157,22 +157,22 @@ do_sys_shutdown: /* sys_shutdown(int, int) */ nop nop nop -do_sys_setsockopt: /* compat_sys_setsockopt(int, int, int, char *, int) */ +do_sys_setsockopt: /* sys_setsockopt(int, int, int, char *, int) */ 47: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_setsockopt), %g1 + sethi %hi(sys_setsockopt), %g1 48: ldswa [%o1 + 0x8] %asi, %o2 49: lduwa [%o1 + 0xc] %asi, %o3 50: ldswa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_setsockopt), %g0 + jmpl %g1 + %lo(sys_setsockopt), %g0 51: ldswa [%o1 + 0x4] %asi, %o1 nop -do_sys_getsockopt: /* compat_sys_getsockopt(int, int, int, u32, u32) */ +do_sys_getsockopt: /* sys_getsockopt(int, int, int, u32, u32) */ 52: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_getsockopt), %g1 + sethi %hi(sys_getsockopt), %g1 53: ldswa [%o1 + 0x8] %asi, %o2 54: lduwa [%o1 + 0xc] %asi, %o3 55: lduwa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_getsockopt), %g0 + jmpl %g1 + %lo(sys_getsockopt), %g0 56: ldswa [%o1 + 0x4] %asi, %o1 nop do_sys_sendmsg: /* compat_sys_sendmsg(int, struct compat_msghdr *, unsigned int) */ diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8004a276cb74..c59b37965add 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -147,7 +147,7 @@ 115 32 getgroups32 sys_getgroups 116 common gettimeofday sys_gettimeofday compat_sys_gettimeofday 117 common getrusage sys_getrusage compat_sys_getrusage -118 common getsockopt sys_getsockopt compat_sys_getsockopt +118 common getsockopt sys_getsockopt sys_getsockopt 119 common getcwd sys_getcwd 120 common readv sys_readv compat_sys_readv 121 common writev sys_writev compat_sys_writev @@ -425,7 +425,7 @@ 352 common userfaultfd sys_userfaultfd 353 common bind sys_bind 354 common listen sys_listen -355 common setsockopt sys_setsockopt compat_sys_setsockopt +355 common setsockopt sys_setsockopt sys_setsockopt 356 common mlock2 sys_mlock2 357 common copy_file_range sys_copy_file_range 358 common preadv2 sys_preadv2 compat_sys_preadv2 diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3d8d70d3896c..1583831f61a9 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,6 +8,13 @@ #include #include +/* + * Reuse the 64-bit entry points for the x32 versions that occupy different + * slots in the syscall table. + */ +#define __x32_sys_getsockopt __x64_sys_getsockopt +#define __x32_sys_setsockopt __x64_sys_setsockopt + #define __SYSCALL_64(nr, sym) #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..43742a69dba1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -376,8 +376,8 @@ 362 i386 connect sys_connect 363 i386 listen sys_listen 364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt 367 i386 getsockname sys_getsockname 368 i386 getpeername sys_getpeername 369 i386 sendto sys_sendto diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat diff --git a/include/linux/compat.h b/include/linux/compat.h index e90100c0de72..c4255d8a4a8a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -737,10 +737,6 @@ asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); -asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen); -asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen); asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b951a87da987..aa46825c6f9d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1424,4 +1424,8 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen); +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f4a01305d9a6..c8c189a5f0a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/net/compat.c b/net/compat.c index 3e6c2c5ff260..091875bd6210 100644 --- a/net/compat.c +++ b/net/compat.c @@ -335,77 +335,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) __scm_destroy(scm); } -static int __compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - struct socket *sock; - - if (optlen > INT_MAX) - return -EINVAL; - - sock = sockfd_lookup(fd, &err); - if (sock) { - err = security_socket_setsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_setsockopt) - err = sock->ops->compat_setsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) -{ - return __compat_sys_setsockopt(fd, level, optname, optval, optlen); -} - -static int __compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, - int __user *optlen) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - - if (sock) { - err = security_socket_getsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_getsockopt) - err = sock->ops->compat_getsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->getsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) -{ - return __compat_sys_getsockopt(fd, level, optname, optval, optlen); -} - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { @@ -565,13 +494,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = __compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = __compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), - compat_ptr(a[4])); + ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); diff --git a/net/socket.c b/net/socket.c index b79376b17b45..dec345982abb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2094,9 +2094,8 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ - -static int __sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen) { mm_segment_t oldfs = get_fs(); char *kernel_optval = NULL; @@ -2114,8 +2113,10 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, - optval, &optlen, &kernel_optval); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, + optval, &optlen, + &kernel_optval); if (err < 0) goto out_put; if (err > 0) { @@ -2154,9 +2155,8 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ - -static int __sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen) { int err, fput_needed; struct socket *sock; @@ -2170,7 +2170,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; - max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (!in_compat_syscall()) + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); @@ -2178,8 +2179,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, - optlen, max_optlen, err); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, max_optlen, + err); out_put: fput_light(sock->file, fput_needed); return err; diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index f4a01305d9a6..c8c189a5f0a6 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 35b61bfc1b1a..b190f2eb2611 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -427,8 +427,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index b38d48464368..56ae24b6e4be 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect compat_sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 compat_sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname compat_sys_getsockname 368 common getpeername sys_getpeername compat_sys_getpeername 369 common sendto sys_sendto compat_sys_sendto diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat -- cgit v1.2.3-59-g8ed1b From eba75c587e811d3249c8bd50d22bb2266ccd3c0f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Jul 2020 09:29:02 -0400 Subject: icmp: support rfc 4884 Add setsockopt SOL_IP/IP_RECVERR_4884 to return the offset to an extension struct if present. ICMP messages may include an extension structure after the original datagram. RFC 4884 standardized this behavior. It stores the offset in words to the extension header in u8 icmphdr.un.reserved[1]. The field is valid only for ICMP types destination unreachable, time exceeded and parameter problem, if length is at least 128 bytes and entire packet does not exceed 576 bytes. Return the offset to the start of the extension struct when reading an ICMP error from the error queue, if it matches the above constraints. Do not return the raw u8 field. Return the offset from the start of the user buffer, in bytes. The kernel does not return the network and transport headers, so subtract those. Also validate the headers. Return the offset regardless of validation, as an invalid extension must still not be misinterpreted as part of the original datagram. Note that !invalid does not imply valid. If the extension version does not match, no validation can take place, for instance. For backward compatibility, make this optional, set by setsockopt SOL_IP/IP_RECVERR_RFC4884. For API example and feature test, see github.com/wdebruij/kerneltools/blob/master/tests/recv_icmp_v2.c For forward compatibility, reserve only setsockopt value 1, leaving other bits for additional icmp extensions. Changes v1->v2: - convert word offset to byte offset from start of user buffer - return in ee_data as u8 may be insufficient - define extension struct and object header structs - return len only if constraints met - if returning len, also validate Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/icmp.h | 4 +++ include/net/inet_sock.h | 1 + include/uapi/linux/errqueue.h | 14 ++++++++- include/uapi/linux/icmp.h | 22 ++++++++++++++ include/uapi/linux/in.h | 1 + net/ipv4/icmp.c | 71 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_sockglue.c | 12 ++++++++ 7 files changed, 124 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 81ca84ce3119..8fc38a34cb20 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -15,6 +15,7 @@ #include #include +#include static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { @@ -35,4 +36,7 @@ static inline bool icmp_is_err(int type) return false; } +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out); + #endif /* _LINUX_ICMP_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a7ce00af6c44..a3702d1d4875 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -225,6 +225,7 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, + recverr_rfc4884:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index ca5cb3e3c6df..3c70e8ac14b8 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -5,6 +5,13 @@ #include #include +/* RFC 4884: return offset to extension struct + validation */ +struct sock_ee_data_rfc4884 { + __u16 len; + __u8 flags; + __u8 reserved; +}; + struct sock_extended_err { __u32 ee_errno; __u8 ee_origin; @@ -12,7 +19,10 @@ struct sock_extended_err { __u8 ee_code; __u8 ee_pad; __u32 ee_info; - __u32 ee_data; + union { + __u32 ee_data; + struct sock_ee_data_rfc4884 ee_rfc4884; + }; }; #define SO_EE_ORIGIN_NONE 0 @@ -31,6 +41,8 @@ struct sock_extended_err { #define SO_EE_CODE_TXTIME_INVALID_PARAM 1 #define SO_EE_CODE_TXTIME_MISSED 2 +#define SO_EE_RFC4884_FLAG_INVALID 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 5589eeb791ca..fb169a50895e 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -19,6 +19,7 @@ #define _UAPI_LINUX_ICMP_H #include +#include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ @@ -95,5 +96,26 @@ struct icmp_filter { __u32 data; }; +/* RFC 4884 extension struct: one per message */ +struct icmp_ext_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 reserved1:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + reserved1:4; +#else +#error "Please fix " +#endif + __u8 reserved2; + __sum16 checksum; +}; + +/* RFC 4884 extension object header: one for each object */ +struct icmp_extobj_hdr { + __be16 length; + __u8 class_num; + __u8 class_type; +}; #endif /* _UAPI_LINUX_ICMP_H */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 8533bf07450f..3d0d8231dc19 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -123,6 +123,7 @@ struct in_addr { #define IP_CHECKSUM 23 #define IP_BIND_ADDRESS_NO_PORT 24 #define IP_RECVFRAGSIZE 25 +#define IP_RECVERR_RFC4884 26 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e30515f89802..793aebf07c2a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1116,6 +1116,77 @@ error: goto drop; } +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + int hlen, off; + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return; + } + + /* outer headers up to inner iph. skb->data is at inner payload */ + hlen = -skb_transport_offset(skb) - sizeof(struct icmphdr); + + /* per rfc 791: maximum packet length of 576 bytes */ + if (hlen + skb->len > 576) + return; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + off = icmp_hdr(skb)->un.reserved[1] * sizeof(u32); + if (off < 128) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} + int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 86b3b9a7cea3..a5ea02d7a183 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -411,6 +411,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { + if (inet_sk(sk)->recverr_rfc4884) + ip_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -904,6 +907,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: + case IP_RECVERR_RFC4884: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -1063,6 +1067,11 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (!val) skb_queue_purge(&sk->sk_error_queue); break; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + goto e_inval; + inet->recverr_rfc4884 = !!val; + break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -1611,6 +1620,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet->recverr; break; + case IP_RECVERR_RFC4884: + val = inet->recverr_rfc4884; + break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; -- cgit v1.2.3-59-g8ed1b From f65b71aa25a65e13cf3d10445a48c63d3eeb942e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:29 +0300 Subject: ptp: add ability to configure duty cycle for periodic output There are external event timestampers (PHCs with support for PTP_EXTTS_REQUEST) that timestamp both event edges. When those edges are very close (such as in the case of a short pulse), there is a chance that the collected timestamp might be of the rising, or of the falling edge, we never know. There are also PHCs capable of generating periodic output with a configurable duty cycle. This is good news, because we can space the rising and falling edge out enough in time, that the risks to overrun the 1-entry timestamp FIFO of the extts PHC are lower (example: the perout PHC can be configured for a period of 1 second, and an "on" time of 0.5 seconds, resulting in a duty cycle of 50%). A flag is introduced for signaling that an on time is present in the perout request structure, for preserving compatibility. Logically speaking, the duty cycle cannot exceed 100% and the PTP core checks for this. PHC drivers that don't support this flag emit a periodic output of an unspecified duty cycle, same as before. The duty cycle is encoded as an "on" time, similar to the "start" and "period" times, and reuses the reserved space while preserving overall binary layout. Pahole reported before: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ unsigned int rsv[4]; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; And now: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ union { struct ptp_clock_time on; /* 40 16 */ unsigned int rsv[4]; /* 40 16 */ }; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 33 +++++++++++++++++++++++++++------ include/uapi/linux/ptp_clock.h | 17 ++++++++++++++--- 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 375cd6e4aade..e0e6f85966e1 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -191,12 +191,33 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EFAULT; break; } - if (((req.perout.flags & ~PTP_PEROUT_VALID_FLAGS) || - req.perout.rsv[0] || req.perout.rsv[1] || - req.perout.rsv[2] || req.perout.rsv[3]) && - cmd == PTP_PEROUT_REQUEST2) { - err = -EINVAL; - break; + if (cmd == PTP_PEROUT_REQUEST2) { + struct ptp_perout_request *perout = &req.perout; + + if (perout->flags & ~PTP_PEROUT_VALID_FLAGS) { + err = -EINVAL; + break; + } + /* + * The "on" field has undefined meaning if + * PTP_PEROUT_DUTY_CYCLE isn't set, we must still treat + * it as reserved, which must be set to zero. + */ + if (!(perout->flags & PTP_PEROUT_DUTY_CYCLE) && + (perout->rsv[0] || perout->rsv[1] || + perout->rsv[2] || perout->rsv[3])) { + err = -EINVAL; + break; + } + if (perout->flags & PTP_PEROUT_DUTY_CYCLE) { + /* The duty cycle must be subunitary. */ + if (perout->on.sec > perout->period.sec || + (perout->on.sec == perout->period.sec && + perout->on.nsec > perout->period.nsec)) { + err = -ERANGE; + break; + } + } } else if (cmd == PTP_PEROUT_REQUEST) { req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index ff070aa64278..1d2841155f7d 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -53,12 +53,14 @@ /* * Bits of the ptp_perout_request.flags field: */ -#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_DUTY_CYCLE (1<<1) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ -#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) +#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ + PTP_PEROUT_DUTY_CYCLE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -105,7 +107,16 @@ struct ptp_perout_request { struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; - unsigned int rsv[4]; /* Reserved for future use. */ + union { + /* + * The "on" time of the signal. + * Must be lower than the period. + * Valid only if (flags & PTP_PEROUT_DUTY_CYCLE) is set. + */ + struct ptp_clock_time on; + /* Reserved for future use. */ + unsigned int rsv[4]; + }; }; #define PTP_MAX_SAMPLES 25 /* Maximum allowed offset measurement samples. */ -- cgit v1.2.3-59-g8ed1b From b6bd41363a1ca39282496803cc32f7515ed917fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:30 +0300 Subject: ptp: introduce a phase offset in the periodic output request Some PHCs like the ocelot/felix switch cannot emit generic periodic output, but just PPS (pulse per second) signals, which: - don't start from arbitrary absolute times, but are rather phase-aligned to the beginning of [the closest next] second. - have an optional phase offset relative to that beginning of the second. For those, it was initially established that they should reject any other absolute time for the PTP_PEROUT_REQUEST than 0.000000000 [1]. But when it actually came to writing an application [2] that makes use of this functionality, we realized that we can't really deal generically with PHCs that support absolute start time, and with PHCs that don't, without an explicit interface. Namely, in an ideal world, PHC drivers would ensure that the "perout.start" value written to hardware will result in a functional output. This means that if the PTP time has become in the past of this PHC's current time, it should be automatically fast-forwarded by the driver into a close enough future time that is known to work (note: this is necessary only if the hardware doesn't do this fast-forward by itself). But we don't really know what is the status for PHC drivers in use today, so in the general sense, user space would be risking to have a non-functional periodic output if it simply asked for a start time of 0.000000000. So let's introduce a flag for this type of reduced-functionality hardware, named PTP_PEROUT_PHASE. The start time is just "soon", the only thing we know for sure about this signal is that its rising edge events, Rn, occur at: Rn = perout.phase + n * perout.period The "phase" in the periodic output structure is simply an alias to the "start" time, since both cannot logically be specified at the same time. Therefore, the binary layout of the structure is not affected. [1]: https://patchwork.ozlabs.org/project/netdev/patch/20200320103726.32559-7-yangbo.lu@nxp.com/ [2]: https://www.mail-archive.com/linuxptp-devel@lists.sourceforge.net/msg04142.html Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/uapi/linux/ptp_clock.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 1d2841155f7d..1d108d597f66 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -55,12 +55,14 @@ */ #define PTP_PEROUT_ONE_SHOT (1<<0) #define PTP_PEROUT_DUTY_CYCLE (1<<1) +#define PTP_PEROUT_PHASE (1<<2) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ #define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ - PTP_PEROUT_DUTY_CYCLE) + PTP_PEROUT_DUTY_CYCLE | \ + PTP_PEROUT_PHASE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -103,7 +105,20 @@ struct ptp_extts_request { }; struct ptp_perout_request { - struct ptp_clock_time start; /* Absolute start time. */ + union { + /* + * Absolute start time. + * Valid only if (flags & PTP_PEROUT_PHASE) is unset. + */ + struct ptp_clock_time start; + /* + * Phase offset. The signal should start toggling at an + * unspecified integer multiple of the period, plus this value. + * The start time should be "as soon as possible". + * Valid only if (flags & PTP_PEROUT_PHASE) is set. + */ + struct ptp_clock_time phase; + }; struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; -- cgit v1.2.3-59-g8ed1b From 6c0246a4588d418f72acd40a7b7601be403d80a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:28 +0800 Subject: perf: Add perf_event_mmap_page::cap_user_time_short ABI In order to support short clock counters, provide an ABI extension. As a whole: u64 time, delta, cyc = read_cycle_counter(); + if (cap_user_time_short) + cyc = time_cycle + ((cyc - time_cycle) & time_mask); delta = mul_u64_u32_shr(cyc, time_mult, time_shift); if (cap_user_time_zero) time = time_zero + delta; delta += time_offset; Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-6-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3-59-g8ed1b From 8e7eafb816ab7e5047b74cb8eb1db2f8f14f7d7a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:32:20 -0700 Subject: RDMA: rdma_user_ioctl.h: fix a duplicated word + clarify Change the repeated word "it" in a comment to "it to". Also insert a dash in the sentence to add clarity. Link: https://lore.kernel.org/r/20200719003220.21250-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index d92d2721b28c..53c55188dd2a 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -43,7 +43,7 @@ /* * General blocks assignments - * It is closed on purpose do not expose it it user space + * It is closed on purpose - do not expose it to user space * #define MAD_CMD_BASE 0x00 * #define HFI1_CMD_BAS 0xE0 */ -- cgit v1.2.3-59-g8ed1b From b43870c74f3fdf0cd06bf5f1b7a5ed70a2cd4ed2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Sat, 4 Jul 2020 15:15:28 +0000 Subject: audit: report audit wait metric in audit status reply In environments where the preservation of audit events and predictable usage of system memory are prioritized, admins may use a combination of --backlog_wait_time and -b options at the risk of degraded performance resulting from backlog waiting. In some cases, this risk may be preferred to lost events or unbounded memory usage. Ideally, this risk can be mitigated by making adjustments when backlog waiting is detected. However, detection can be difficult using the currently available metrics. For example, an admin attempting to debug degraded performance may falsely believe a full backlog indicates backlog waiting. It may turn out the backlog frequently fills up but drains quickly. To make it easier to reliably track degraded performance to backlog waiting, this patch makes the following changes: Add a new field backlog_wait_time_total to the audit status reply. Initialize this field to zero. Add to this field the total time spent by the current task on scheduled timeouts while the backlog limit is exceeded. Reset field to zero upon request via AUDIT_SET. Tested on Ubuntu 18.04 using complementary changes to the audit-userspace and audit-testsuite: - https://github.com/linux-audit/audit-userspace/pull/134 - https://github.com/linux-audit/audit-testsuite/pull/97 Signed-off-by: Max Englander Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 18 +++++++++++------- kernel/audit.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9b6a973f4cc3..cd2d8279a5e4 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -333,14 +333,15 @@ enum { }; /* Status symbols */ - /* Mask values */ -#define AUDIT_STATUS_ENABLED 0x0001 -#define AUDIT_STATUS_FAILURE 0x0002 -#define AUDIT_STATUS_PID 0x0004 + /* Mask values */ +#define AUDIT_STATUS_ENABLED 0x0001 +#define AUDIT_STATUS_FAILURE 0x0002 +#define AUDIT_STATUS_PID 0x0004 #define AUDIT_STATUS_RATE_LIMIT 0x0008 -#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 -#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 -#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 +#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL 0x0080 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 @@ -467,6 +468,9 @@ struct audit_status { __u32 feature_bitmap; /* bitmap of kernel audit features */ }; __u32 backlog_wait_time;/* message queue wait timeout */ + __u32 backlog_wait_time_actual;/* time spent waiting while + * message limit exceeded + */ }; struct audit_features { diff --git a/kernel/audit.c b/kernel/audit.c index a2f3e34aa724..d72663ac248c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); + atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3-59-g8ed1b From 4787dd582dbde0b7f29eb3dbe59df3da1b350925 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Fri, 17 Jul 2020 08:05:12 +0530 Subject: bareudp: Reverted support to enable & disable rx metadata collection The commit fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") breaks the the original(5.7) default behavior of bareudp module to collect RX metadadata at the receive. It was added to avoid the crash at the kernel neighbour subsytem when packet with metadata from bareudp is processed. But it is no more needed as the commit 394de110a733 ("net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb") solves this crash. Fixes: fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") Signed-off-by: Martin Varghese Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++---- drivers/net/bareudp.c | 21 +++++---------------- include/net/bareudp.h | 1 - include/uapi/linux/if_link.h | 1 - 4 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 0e00636d8d74..465a8b251bfe 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,7 +48,5 @@ enabled. The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device decapsulates the udp header and passes the inner packet to the -network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel -information will be stored in the SKB dst field before the packet buffer is -passed to the network stack. +bareudp device extracts and stores the tunnel information in SKB dst field before +passing the packet buffer to the network stack. diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 108a8cafc4f8..44eb2b1d0416 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,7 +46,6 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -126,14 +125,12 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - if (bareudp->rx_collect_metadata) { - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; - } - skb_dst_set(skb, &tun_dst->dst); + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; } + skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -577,9 +574,6 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; - if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) - conf->rx_collect_metadata = true; - return 0; } @@ -617,7 +611,6 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; - bareudp->rx_collect_metadata = conf->rx_collect_metadata; err = register_netdevice(dev); if (err) @@ -676,7 +669,6 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ - nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -693,9 +685,6 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; - if (bareudp->rx_collect_metadata && - nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) - goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index 3dd5f9a8d01c..dc65a0d71d9b 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,7 +12,6 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 26842ffd0501..af8f31987526 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -601,7 +601,6 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, - IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3-59-g8ed1b From c333f9495c451d958c6f4a41e5de2d8f80f79496 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:37:13 -0700 Subject: raid: md_p.h: drop duplicated word in a comment Drop the doubled word "the" in a comment. Signed-off-by: Randy Dunlap Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 1f2d8c81f0e0..e5a98a16f9b0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s { /* * Notes: - * - if an array is being reshaped (restriped) in order to change the + * - if an array is being reshaped (restriped) in order to change * the number of active devices in the array, 'raid_disks' will be * the larger of the old and new numbers. 'delta_disks' will * be the "new - old". So if +ve, raid_disks is the new value, and -- cgit v1.2.3-59-g8ed1b From 1859f4ebcf2dfe2e878a3805700036fdc4e01e2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:27:38 -0700 Subject: android: binder.h: drop a duplicated word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the repeated word "the" in a comment. Cc: Arve Hjønnevåg Cc: Todd Kjos Cc: Martijn Coenen Cc: Joel Fernandes Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: devel@driverdev.osuosl.org Acked-by: Christian Brauner Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20200719002738.20210-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 2832134e5397..f1ce2c4c077e 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -404,7 +404,7 @@ enum binder_driver_return_protocol { BR_FAILED_REPLY = _IO('r', 17), /* - * The the last transaction (either a bcTRANSACTION or + * The last transaction (either a bcTRANSACTION or * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. */ }; -- cgit v1.2.3-59-g8ed1b From 7deff7b5b4395784b194bae3631b8333d3423938 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:28:41 -0700 Subject: hyperv: hyperv.h: drop a duplicated word Drop the repeated word "the" in a comment. Signed-off-by: Randy Dunlap Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: linux-hyperv@vger.kernel.org Link: https://lore.kernel.org/r/20200719002841.20369-1-rdunlap@infradead.org Signed-off-by: Wei Liu --- include/uapi/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 8f24404ad04f..6135d92e0d47 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -219,7 +219,7 @@ struct hv_do_fcopy { * kernel and user-level daemon communicate using a connector channel. * * The user mode component first registers with the - * the kernel component. Subsequently, the kernel component requests, data + * kernel component. Subsequently, the kernel component requests, data * for the specified keys. In response to this message the user mode component * fills in the value corresponding to the specified key. We overload the * sequence field in the cn_msg header to define our KVP message types. -- cgit v1.2.3-59-g8ed1b From 3bf1c021e36e7269c71dceafec713b0181f413a9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 9 Jun 2020 16:14:55 +0300 Subject: uapi/habanalabs: fix some comments MAP/UNMAP are done also for device memory. Reviewed-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f6267a8d7416..f218d1c62c62 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -530,13 +530,13 @@ union hl_wait_cs_args { struct hl_wait_cs_out out; }; -/* Opcode to alloc device memory */ +/* Opcode to allocate device memory */ #define HL_MEM_OP_ALLOC 0 /* Opcode to free previously allocated device memory */ #define HL_MEM_OP_FREE 1 -/* Opcode to map host memory */ +/* Opcode to map host and device memory */ #define HL_MEM_OP_MAP 2 -/* Opcode to unmap previously mapped host memory */ +/* Opcode to unmap previously mapped host and device memory */ #define HL_MEM_OP_UNMAP 3 /* Memory flags */ -- cgit v1.2.3-59-g8ed1b From db491e4f08a9fd84ebb1ebd22a6b0b988a81a0d8 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 18 Jun 2020 09:51:16 +0300 Subject: habanalabs: Add dropped cs statistics info struct Add command submission statistics structure which can be obtained through the info ioctl. Each drop counter describes the reason for which the command submission was dropped. This information is needed for the user to be aware of the specific reason for which the submitted work was dropped. The user can then utilize the driver more efficiently. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 24 +++++++++++++++++++++++- drivers/misc/habanalabs/habanalabs.h | 5 +++++ drivers/misc/habanalabs/habanalabs_ioctl.c | 24 ++++++++++++++++++++++++ drivers/misc/habanalabs/hw_queue.c | 5 ++++- include/uapi/misc/habanalabs.h | 21 +++++++++++++++++++++ 5 files changed, 77 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index f81d6685e011..777f88d25acd 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -246,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job) kfree(job); } +static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx) +{ + hdev->aggregated_cs_counters.device_in_reset_drop_cnt += + ctx->cs_counters.device_in_reset_drop_cnt; + hdev->aggregated_cs_counters.out_of_mem_drop_cnt += + ctx->cs_counters.out_of_mem_drop_cnt; + hdev->aggregated_cs_counters.parsing_drop_cnt += + ctx->cs_counters.parsing_drop_cnt; + hdev->aggregated_cs_counters.queue_full_drop_cnt += + ctx->cs_counters.queue_full_drop_cnt; +} + static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, @@ -349,6 +361,8 @@ static void cs_do_release(struct kref *ref) dma_fence_signal(cs->fence); dma_fence_put(cs->fence); + cs_counters_aggregate(hdev, cs->ctx); + kfree(cs); } @@ -632,12 +646,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = validate_queue_index(hdev, chunk, &queue_type, &is_kernel_allocated_cb); - if (rc) + if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; goto free_cs_object; + } if (is_kernel_allocated_cb) { cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk); if (!cb) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; rc = -EINVAL; goto free_cs_object; } @@ -651,6 +668,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { + hpriv->ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; if (is_kernel_allocated_cb) @@ -683,6 +701,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = cs_parser(hpriv, job); if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", cs->ctx->asid, cs->sequence, job->id, rc); @@ -691,6 +710,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, } if (int_queues_only) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Reject CS %d.%llu because only internal queues jobs are present\n", cs->ctx->asid, cs->sequence); @@ -875,6 +895,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, job = hl_cs_allocate_job(hdev, q_type, true); if (!job) { + ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; goto put_cs; @@ -882,6 +903,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, cb = hl_cb_kernel_create(hdev, PAGE_SIZE); if (!cb) { + ctx->cs_counters.out_of_mem_drop_cnt++; kfree(job); rc = -EFAULT; goto put_cs; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e4d6f7c91194..ae781453a509 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -10,6 +10,7 @@ #include "include/armcp_if.h" #include "include/qman_if.h" +#include #include #include @@ -787,6 +788,7 @@ struct hl_ctx { struct mutex mem_hash_lock; struct mutex mmu_lock; struct list_head debugfs_list; + struct hl_cs_counters cs_counters; u64 cs_sequence; u64 *dram_default_hops; spinlock_t cs_lock; @@ -1391,6 +1393,7 @@ struct hl_device_idle_busy_ts { * @compute_ctx: current compute context executing. * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy * and vice-versa + * @aggregated_cs_counters: aggregated cs counters among all contexts * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -1489,6 +1492,8 @@ struct hl_device { struct hl_device_idle_busy_ts *idle_busy_ts_arr; + struct hl_cs_counters aggregated_cs_counters; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 52eedd3a6c3a..5af1c03da473 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -276,6 +276,27 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; } +static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_cs_counters cs_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters, + sizeof(struct hl_cs_counters)); + + if (hpriv->ctx) + memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters, + sizeof(struct hl_cs_counters)); + + return copy_to_user(out, &cs_counters, + min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -336,6 +357,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_TIME_SYNC: return time_sync_info(hdev, args); + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index f5a10a5ac300..da66ffb528f8 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -514,6 +514,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) hdev->asic_funcs->hw_queues_lock(hdev); if (hl_device_disabled_or_in_reset(hdev)) { + ctx->cs_counters.device_in_reset_drop_cnt++; dev_err(hdev->dev, "device is disabled or in reset, CS rejected!\n"); rc = -EPERM; @@ -543,8 +544,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) break; } - if (rc) + if (rc) { + ctx->cs_counters.queue_full_drop_cnt++; goto unroll_cq_resv; + } if (q->queue_type == QUEUE_TYPE_EXT || q->queue_type == QUEUE_TYPE_HW) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f218d1c62c62..d5c4f983b7a8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -263,6 +263,7 @@ enum hl_device_status { * time the driver was loaded. * HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time * for synchronization. + * HL_INFO_CS_COUNTERS - Retrieve command submission counters */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -274,6 +275,7 @@ enum hl_device_status { #define HL_INFO_CLK_RATE 8 #define HL_INFO_RESET_COUNT 9 #define HL_INFO_TIME_SYNC 10 +#define HL_INFO_CS_COUNTERS 11 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -338,6 +340,25 @@ struct hl_info_time_sync { __u64 host_time; }; +/** + * struct hl_info_cs_counters - command submission counters + * @out_of_mem_drop_cnt: dropped due to memory allocation issue + * @parsing_drop_cnt: dropped due to error in packet parsing + * @queue_full_drop_cnt: dropped due to queue full + * @device_in_reset_drop_cnt: dropped due to device in reset + */ +struct hl_cs_counters { + __u64 out_of_mem_drop_cnt; + __u64 parsing_drop_cnt; + __u64 queue_full_drop_cnt; + __u64 device_in_reset_drop_cnt; +}; + +struct hl_info_cs_counters { + struct hl_cs_counters cs_counters; + struct hl_cs_counters ctx_cs_counters; +}; + struct hl_info_args { /* Location of relevant struct in userspace */ __u64 return_pointer; -- cgit v1.2.3-59-g8ed1b From 8b603d0715a372f5827d3a6b19d9568bf854b687 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 24 Jul 2020 10:41:12 +0200 Subject: RDMA/mlx5: Fix typo in enum name Nnothing uses the enum name, so this is harmless. Fixes: 322694412400 ("IB/mlx5: Introduce driver create and destroy flow methods") Link: https://lore.kernel.org/r/20200724084112.GC31930@amd Signed-off-by: Pavel Machek (CIP) Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b330e6eee626..e24d66d278cf 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -263,7 +263,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_FLAGS, }; -enum mlx5_ib_destoy_flow_attrs { +enum mlx5_ib_destroy_flow_attrs { MLX5_IB_ATTR_DESTROY_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), }; -- cgit v1.2.3-59-g8ed1b From 5923b8f7fa218a9bccd730c0a9692635eb2fc740 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 23 Jul 2020 01:03:01 +0300 Subject: net/sched: cls_flower: Add hash info to flow classification Adding new cls flower keys for hash value and hash mask and dissect the hash info from the skb into the flow key towards flow classication. Signed-off-by: Ariel Levkovich Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7576209d96f9..ee95f42fb0ec 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -578,6 +578,9 @@ enum { TCA_FLOWER_KEY_MPLS_OPTS, + TCA_FLOWER_KEY_HASH, /* u32 */ + TCA_FLOWER_KEY_HASH_MASK, /* u32 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index acd8e05c2ba5..a4f7ef1de7e7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -64,6 +64,7 @@ struct fl_flow_key { }; } tp_range; struct flow_dissector_key_ct ct; + struct flow_dissector_key_hash hash; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -318,6 +319,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map)); + skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); f = fl_mask_lookup(mask, &skb_key); @@ -695,6 +697,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + }; static const struct nla_policy @@ -1626,6 +1631,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash)); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { ret = fl_set_enc_opt(tb, key, mask, extack); if (ret) @@ -1740,6 +1749,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CT, ct); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_HASH, hash); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2960,6 +2971,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash))) + goto nla_put_failure; + return 0; nla_put_failure: -- cgit v1.2.3-59-g8ed1b From 01370434df85eb76ecb1527a4466013c4aca2436 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 24 Jul 2020 09:03:10 -0400 Subject: icmp6: support rfc 4884 Extend the rfc 4884 read interface introduced for ipv4 in commit eba75c587e81 ("icmp: support rfc 4884") to ipv6. Add socket option SOL_IPV6/IPV6_RECVERR_RFC4884. Changes v1->v2: - make ipv6_icmp_error_rfc4884 static (file scope) Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/icmpv6.h | 1 + include/uapi/linux/in6.h | 1 + net/ipv4/icmp.c | 1 + net/ipv6/datagram.c | 16 ++++++++++++++++ net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 6 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8d8f877e7f81..a44789d027cc 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -283,6 +283,7 @@ struct ipv6_pinfo { autoflowlabel:1, autoflowlabel_set:1, mc_all:1, + recverr_rfc4884:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 2622b5a3e616..c1661febc2dc 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -68,6 +68,7 @@ struct icmp6hdr { #define icmp6_mtu icmp6_dataun.un_data32[0] #define icmp6_unused icmp6_dataun.un_data32[0] #define icmp6_maxdelay icmp6_dataun.un_data16[0] +#define icmp6_datagram_len icmp6_dataun.un_data8[0] #define icmp6_router icmp6_dataun.u_nd_advt.router #define icmp6_solicited icmp6_dataun.u_nd_advt.solicited #define icmp6_override icmp6_dataun.u_nd_advt.override diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 9f2273a08356..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -179,6 +179,7 @@ struct in6_flowlabel_req { #define IPV6_LEAVE_ANYCAST 28 #define IPV6_MULTICAST_ALL 29 #define IPV6_ROUTER_ALERT_ISOLATE 30 +#define IPV6_RECVERR_RFC4884 31 /* IPV6_MTU_DISCOVER values */ #define IPV6_PMTUDISC_DONT 0 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7498c58460a1..cf36f955bfe6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1173,6 +1173,7 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb, if (!ip_icmp_error_rfc4884_validate(skb, off)) out->flags |= SO_EE_RFC4884_FLAG_INVALID; } +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); int icmp_err(struct sk_buff *skb, u32 info) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 390bedde21a5..cc8ad7ddecda 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); +static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_TIME_EXCEED: + case ICMPV6_DEST_UNREACH: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), + icmp6_hdr(skb)->icmp6_datagram_len * 8); + } +} + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; __skb_pull(skb, payload - skb->data); + + if (inet6_sk(sk)->recverr_rfc4884) + ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d2282f5c9760..20c740976334 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -965,6 +965,14 @@ done: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 1) + goto e_inval; + np->recverr_rfc4884 = valbool; + retv = 0; + break; } release_sock(sk); @@ -1439,6 +1447,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rtalert_isolate; break; + case IPV6_RECVERR_RFC4884: + val = np->recverr_rfc4884; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3-59-g8ed1b From d721a43ff69cd473019c3b77aacb76b09102aca3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:27 +0800 Subject: bcache: increase super block version for cache device and backing device The new added super block version BCACHE_SB_VERSION_BDEV_WITH_FEATURES (5) BCACHE_SB_VERSION_CDEV_WITH_FEATURES value (6), is for the feature set bits. Devices have super block version equal to the new version will have three new members for feature set bits in the on-disk super block, __le64 feature_compat; __le64 feature_incompat; __le64 feature_ro_compat; They are used for further new features which may introduce on-disk format change, and avoid unncessary super block version increase. The very basic features handling code skeleton is also initialized in this patch. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/features.h | 78 ++++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/super.c | 32 ++++++++++++++++-- include/uapi/linux/bcache.h | 29 +++++++++++----- 3 files changed, 128 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.h (limited to 'include/uapi') diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..ae7df37b9862 --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include +#include +#include + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +#define BCH_FEATURE_COMPAT_SUUP 0 +#define BCH_FEATURE_RO_COMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP 0 + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +/* Feature set definition */ + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +#endif diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40fb18028c01..c6ef410a21a8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,6 +13,7 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include #include @@ -194,6 +195,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -207,6 +209,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (err) goto err; break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + err = read_super_common(sb, bdev, s); + if (err) + goto err; + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + break; default: err = "Unsupported superblock version"; goto err; @@ -241,7 +251,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -257,6 +266,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); pr_debug("ver %llu, flags %llu, seq %llu\n", @@ -313,17 +329,20 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned int i; + unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); c->sb.seq++; + if (c->sb.version > version) + version = c->sb.version; + for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.version = version; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -1839,6 +1858,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sb.bucket_size = sb->bucket_size; c->sb.nr_in_set = sb->nr_in_set; c->sb.last_mount = sb->last_mount; + c->sb.version = sb->version; + if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + c->sb.feature_compat = sb->feature_compat; + c->sb.feature_ro_compat = sb->feature_ro_compat; + c->sb.feature_incompat = sb->feature_incompat; + } + c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 9a1965c6c3d0..47df2db2e727 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) * Version 3: Cache device with new UUID format * Version 4: Backing device with data offset */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 #define SB_SECTOR 8 #define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) @@ -173,7 +175,12 @@ struct cache_sb_disk { __le64 flags; __le64 seq; - __le64 pad[8]; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; union { struct { @@ -224,7 +231,12 @@ struct cache_sb { __u64 flags; __u64 seq; - __u64 pad[8]; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; + + __u64 pad[5]; union { struct { @@ -262,7 +274,8 @@ struct cache_sb { static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) { return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; } BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -- cgit v1.2.3-59-g8ed1b From 4c1ccd0896d6a45f2159280d957afd441a7aeaba Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:29 +0800 Subject: bcache: struct cache_sb is only for in-memory super block now We have struct cache_sb_disk for on-disk super block already, it is unnecessary to keep the in-memory super block format exactly mapping to the on-disk struct layout. This patch adds code comments to notice that struct cache_sb is not exactly mapping to cache_sb_disk, and removes the useless member csum and pad[5]. Although struct cache_sb does not belong to uapi, but there are still some on-disk format related macros reference it and it is unncessary to get rid of such dependency now. So struct cache_sb will continue to stay in include/uapi/linux/bache.h for now. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/bcache.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 47df2db2e727..0ef984ea515a 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -215,8 +215,13 @@ struct cache_sb_disk { __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ }; +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ struct cache_sb { - __u64 csum; __u64 offset; /* sector where this sb was written */ __u64 version; @@ -236,8 +241,6 @@ struct cache_sb { __u64 feature_incompat; __u64 feature_ro_compat; - __u64 pad[5]; - union { struct { /* Cache devices */ @@ -245,7 +248,6 @@ struct cache_sb { __u16 block_size; /* sectors */ __u16 bucket_size; /* sectors */ - __u16 nr_in_set; __u16 nr_this_dev; }; -- cgit v1.2.3-59-g8ed1b From ffa470327572b8f85dceda48fd0676d9658cb8c5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:35 +0800 Subject: bcache: add bucket_size_hi into struct cache_sb_disk for large bucket The large bucket feature is to extend bucket_size from 16bit to 32bit. When create cache device on zoned device (e.g. zoned NVMe SSD), making a single bucket cover one or more zones of the zoned device is the simplest way to support zoned device as cache by bcache. But current maximum bucket size is 16MB and a typical zone size of zoned device is 256MB, this is the major motiviation to extend bucket size to a larger bit width. This patch is the basic and first change to support large bucket size, the major changes it makes are, - Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature, INCOMPAT means it introduces incompatible on-disk format change. - Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines. - Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0 for the on-disk super block format. - For the in-memory super block struct cache_sb, member bucket_size is extended from __u16 to __32. - Add get_bucket_size() to combine the bucket_size and bucket_size_hi from struct cache_sb_disk into an unsigned int value. Since we already have large bucket size helpers meta_bucket_pages(), meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when bucket size > 8MB, the memory allocation for bcache meta data bucket won't fail no matter how large the bucket size extended. So these meta data buckets are handled properly when the bucket size width increase from 16bit to 32bit, we don't need to worry about them. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- drivers/md/bcache/features.c | 22 ++++++++++++++++++++++ drivers/md/bcache/features.h | 9 ++++++--- drivers/md/bcache/movinggc.c | 4 ++-- drivers/md/bcache/super.c | 23 +++++++++++++++++++---- include/uapi/linux/bcache.h | 3 ++- 6 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.c (limited to 'include/uapi') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..52035a78d836 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; unsigned int i; int r; diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..ba53944bb390 --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li + * + */ +#include +#include "bcache.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + "large_bucket"}, + {0, 0, 0 }, +}; diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index ae7df37b9862..dca052cf5203 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -11,9 +11,13 @@ #define BCH_FEATURE_INCOMPAT 2 #define BCH_FEATURE_TYPE_MASK 0x03 +/* Feature set definition */ +/* Incompat feature set */ +#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ + #define BCH_FEATURE_COMPAT_SUUP 0 #define BCH_FEATURE_RO_COMPAT_SUUP 0 -#define BCH_FEATURE_INCOMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ ((sb)->feature_compat & (mask)) @@ -22,8 +26,6 @@ #define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ ((sb)->feature_incompat & (mask)) -/* Feature set definition */ - #define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline int bch_has_feature_##name(struct cache_sb *sb) \ { \ @@ -75,4 +77,5 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ ~BCH##_FEATURE_INCOMPAT_##flagname; \ } +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); #endif diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index b7dd2d75f58c..5872d6470470 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + unsigned long sectors_to_move = 0; + unsigned long reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e4f05c4ddcdd..62c9681fe92f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -60,6 +60,17 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && + bch_has_feature_large_bucket(sb)) + bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + + return bucket_size; +} + static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk *s) { @@ -68,7 +79,7 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * sb->first_bucket= le16_to_cpu(s->first_bucket); sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); + sb->bucket_size = get_bucket_size(sb, s); sb->nr_in_set = le16_to_cpu(s->nr_in_set); sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); @@ -210,12 +221,16 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; break; case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: - err = read_super_common(sb, bdev, s); - if (err) - goto err; + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ sb->feature_compat = le64_to_cpu(s->feature_compat); sb->feature_incompat = le64_to_cpu(s->feature_incompat); sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + err = read_super_common(sb, bdev, s); + if (err) + goto err; break; default: err = "Unsupported superblock version"; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 0ef984ea515a..52e8bcb33981 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -213,6 +213,7 @@ struct cache_sb_disk { __le16 keys; }; __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 bucket_size_hi; }; /* @@ -247,9 +248,9 @@ struct cache_sb { __u64 nbuckets; /* device size */ __u16 block_size; /* sectors */ - __u16 bucket_size; /* sectors */ __u16 nr_in_set; __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ }; struct { /* Backing devices */ -- cgit v1.2.3-59-g8ed1b From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:07:30 -0700 Subject: libnvdimm: Validate command family indices The ND_CMD_CALL format allows for a general passthrough of passlisted commands targeting a given command set. However there is no validation of the family index relative to what the bus supports. - Update the NFIT bus implementation (the only one that supports ND_CMD_CALL passthrough) to also passlist the valid set of command family indices. - Update the generic __nd_ioctl() path to validate that field on behalf of all implementations. Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism") Cc: Vishal Verma Cc: Dave Jiang Cc: Ira Weiny Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- drivers/acpi/nfit/core.c | 11 +++++++++-- drivers/acpi/nfit/nfit.h | 1 - drivers/nvdimm/bus.c | 16 ++++++++++++++++ include/linux/libnvdimm.h | 2 ++ include/uapi/linux/ndctl.h | 4 ++++ 5 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 7c138a4edc03..1f72ce1a782b 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem) static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; unsigned long dsm_mask, label_mask; @@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, /* nfit test assumes 1:1 relationship between commands and dsms */ nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; nfit_mem->family = NVDIMM_FAMILY_INTEL; + set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID) sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x", @@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, * Note, that checking for function0 (bit0) tells us if any commands * are reachable through this GUID. */ + clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) - if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) + if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) { + set_bit(i, &nd_desc->dimm_family_mask); if (family < 0 || i == default_dsm_family) family = i; + } /* limit the supported commands to those that are publicly documented */ nfit_mem->family = family; @@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en; + set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); + set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); + adev = to_acpi_dev(acpi_desc); if (!adev) return; @@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &nd_desc->cmd_mask); - set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); dsm_mask = (1 << ND_CMD_ARS_CAP) | diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index f5525f8bb770..5c5e7ebba8dc 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -33,7 +33,6 @@ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) -#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV #define NVDIMM_CMD_MAX 31 #define NVDIMM_STANDARD_CMDMASK \ diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 09087c38fabd..955265656b96 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, dimm_name = "bus"; } + /* Validate command family support against bus declared support */ if (cmd == ND_CMD_CALL) { + unsigned long *mask; + if (copy_from_user(&pkg, p, sizeof(pkg))) return -EFAULT; + + if (nvdimm) { + if (pkg.nd_family > NVDIMM_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->dimm_family_mask; + } else { + if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->bus_family_mask; + } + + if (!test_bit(pkg.nd_family, mask)) + return -EINVAL; } if (!desc || diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 18da4059be09..bd39a2cf7972 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long bus_dsm_mask; unsigned long cmd_mask; + unsigned long dimm_family_mask; + unsigned long bus_family_mask; struct module *module; char *provider_name; struct device_node *of_node; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 0e09dc5cec19..e9468b9332bd 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -245,6 +245,10 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_MSFT 3 #define NVDIMM_FAMILY_HYPERV 4 #define NVDIMM_FAMILY_PAPR 5 +#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR + +#define NVDIMM_BUS_FAMILY_NFIT 0 +#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3-59-g8ed1b From 6450ddbd5d8e83ea9927c7f9076a21f829699e0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:07:40 -0700 Subject: ACPI: NFIT: Define runtime firmware activation commands Platform reboots are expensive. Towards reducing downtime to apply firmware updates the Intel NVDIMM command definition is growing support for applying live firmware updates that only require temporarily suspending memory traffic instead of a full reboot. Follow-on commits add support for triggering firmware activation, this patch only defines the commands, adds probe support, and validates that they are blocked via the ioctl path. The ioctl-path block ensures that the OS is in charge since these commands have side effects only the OS can handle. Specifically firmware activation may cause the memory controller to be quiesced on the order of 100s of milliseconds. In that case Linux ensure the activation only takes place while the OS is in a suspend state. Link: https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf Cc: Vishal Verma Cc: Dave Jiang Cc: Ira Weiny Cc: "Rafael J. Wysocki" Cc: Len Brown Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- drivers/acpi/nfit/core.c | 86 +++++++++++++++++++++++++++++++--------------- drivers/acpi/nfit/intel.h | 53 ++++++++++++++++++++++++++++ drivers/acpi/nfit/nfit.h | 25 +++++++++++++- include/uapi/linux/ndctl.h | 3 +- 4 files changed, 137 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 9fdd655bdf0e..78cc9e2d2aa3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -73,6 +73,18 @@ const guid_t *to_nfit_uuid(enum nfit_uuids id) } EXPORT_SYMBOL(to_nfit_uuid); +static const guid_t *to_nfit_bus_uuid(int family) +{ + if (WARN_ONCE(family == NVDIMM_BUS_FAMILY_NFIT, + "only secondary bus families can be translated\n")) + return NULL; + /* + * The index of bus UUIDs starts immediately following the last + * NVDIMM/leaf family. + */ + return to_nfit_uuid(family + NVDIMM_FAMILY_MAX); +} + static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; @@ -362,24 +374,8 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func) { static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = { [NVDIMM_FAMILY_INTEL] = { - [NVDIMM_INTEL_GET_MODES] = 2, - [NVDIMM_INTEL_GET_FWINFO] = 2, - [NVDIMM_INTEL_START_FWUPDATE] = 2, - [NVDIMM_INTEL_SEND_FWUPDATE] = 2, - [NVDIMM_INTEL_FINISH_FWUPDATE] = 2, - [NVDIMM_INTEL_QUERY_FWUPDATE] = 2, - [NVDIMM_INTEL_SET_THRESHOLD] = 2, - [NVDIMM_INTEL_INJECT_ERROR] = 2, - [NVDIMM_INTEL_GET_SECURITY_STATE] = 2, - [NVDIMM_INTEL_SET_PASSPHRASE] = 2, - [NVDIMM_INTEL_DISABLE_PASSPHRASE] = 2, - [NVDIMM_INTEL_UNLOCK_UNIT] = 2, - [NVDIMM_INTEL_FREEZE_LOCK] = 2, - [NVDIMM_INTEL_SECURE_ERASE] = 2, - [NVDIMM_INTEL_OVERWRITE] = 2, - [NVDIMM_INTEL_QUERY_OVERWRITE] = 2, - [NVDIMM_INTEL_SET_MASTER_PASSPHRASE] = 2, - [NVDIMM_INTEL_MASTER_SECURE_ERASE] = 2, + [NVDIMM_INTEL_GET_MODES ... + NVDIMM_INTEL_FW_ACTIVATE_ARM] = 2, }, }; u8 id; @@ -406,7 +402,7 @@ static bool payload_dumpable(struct nvdimm *nvdimm, unsigned int func) } static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, - struct nd_cmd_pkg *call_pkg) + struct nd_cmd_pkg *call_pkg, int *family) { if (call_pkg) { int i; @@ -417,6 +413,7 @@ static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++) if (call_pkg->nd_reserved2[i]) return -EINVAL; + *family = call_pkg->nd_family; return call_pkg->nd_command; } @@ -450,13 +447,14 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, acpi_handle handle; const guid_t *guid; int func, rc, i; + int family = 0; if (cmd_rc) *cmd_rc = -EINVAL; if (cmd == ND_CMD_CALL) call_pkg = buf; - func = cmd_to_func(nfit_mem, cmd, call_pkg); + func = cmd_to_func(nfit_mem, cmd, call_pkg, &family); if (func < 0) return func; @@ -478,9 +476,17 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, cmd_name = nvdimm_bus_cmd_name(cmd); cmd_mask = nd_desc->cmd_mask; - dsm_mask = acpi_desc->bus_dsm_mask; + if (cmd == ND_CMD_CALL && call_pkg->nd_family) { + family = call_pkg->nd_family; + if (!test_bit(family, &nd_desc->bus_family_mask)) + return -EINVAL; + dsm_mask = acpi_desc->family_dsm_mask[family]; + guid = to_nfit_bus_uuid(family); + } else { + dsm_mask = acpi_desc->bus_dsm_mask; + guid = to_nfit_uuid(NFIT_DEV_BUS); + } desc = nd_cmd_bus_desc(cmd); - guid = to_nfit_uuid(NFIT_DEV_BUS); handle = adev->handle; dimm_name = "bus"; } @@ -516,8 +522,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, in_buf.buffer.length = call_pkg->nd_size_in; } - dev_dbg(dev, "%s cmd: %d: func: %d input length: %d\n", - dimm_name, cmd, func, in_buf.buffer.length); + dev_dbg(dev, "%s cmd: %d: family: %d func: %d input length: %d\n", + dimm_name, cmd, family, func, in_buf.buffer.length); if (payload_dumpable(nvdimm, func)) print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, in_buf.buffer.pointer, @@ -2153,14 +2159,21 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS); + unsigned long dsm_mask, *mask; struct acpi_device *adev; - unsigned long dsm_mask; int i; - nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); + /* enable nfit_test to inject bus command emulation */ + if (acpi_desc->bus_cmd_force_en) { + nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; + mask = &nd_desc->bus_family_mask; + if (acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]) + set_bit(NVDIMM_BUS_FAMILY_INTEL, mask); + } + adev = to_acpi_dev(acpi_desc); if (!adev) return; @@ -2181,6 +2194,14 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &acpi_desc->bus_dsm_mask); + + /* Enumerate allowed NVDIMM_BUS_FAMILY_INTEL commands */ + dsm_mask = NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK; + guid = to_nfit_bus_uuid(NVDIMM_BUS_FAMILY_INTEL); + mask = &acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]; + for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) + if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) + set_bit(i, mask); } static ssize_t range_index_show(struct device *dev, @@ -3492,7 +3513,10 @@ static int __acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, return 0; } -/* prevent security commands from being issued via ioctl */ +/* + * Prevent security and firmware activate commands from being issued via + * ioctl. + */ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf) { @@ -3503,10 +3527,15 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, call_pkg->nd_family == NVDIMM_FAMILY_INTEL) { func = call_pkg->nd_command; if (func > NVDIMM_CMD_MAX || - (1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK) + (1 << func) & NVDIMM_INTEL_DENY_CMDMASK) return -EOPNOTSUPP; } + /* block all non-nfit bus commands */ + if (!nvdimm && cmd == ND_CMD_CALL && + call_pkg->nd_family != NVDIMM_BUS_FAMILY_NFIT) + return -EOPNOTSUPP; + return __acpi_nfit_clear_to_send(nd_desc, nvdimm, cmd); } @@ -3798,6 +3827,7 @@ static __init int nfit_init(void) guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); + guid_parse(UUID_INTEL_BUS, &nfit_uuid[NFIT_BUS_INTEL]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) diff --git a/drivers/acpi/nfit/intel.h b/drivers/acpi/nfit/intel.h index 0aca682ab9d7..868d073731cc 100644 --- a/drivers/acpi/nfit/intel.h +++ b/drivers/acpi/nfit/intel.h @@ -111,4 +111,57 @@ struct nd_intel_master_secure_erase { u8 passphrase[ND_INTEL_PASSPHRASE_SIZE]; u32 status; } __packed; + +#define ND_INTEL_FWA_IDLE 0 +#define ND_INTEL_FWA_ARMED 1 +#define ND_INTEL_FWA_BUSY 2 + +#define ND_INTEL_DIMM_FWA_NONE 0 +#define ND_INTEL_DIMM_FWA_NOTSTAGED 1 +#define ND_INTEL_DIMM_FWA_SUCCESS 2 +#define ND_INTEL_DIMM_FWA_NEEDRESET 3 +#define ND_INTEL_DIMM_FWA_MEDIAFAILED 4 +#define ND_INTEL_DIMM_FWA_ABORT 5 +#define ND_INTEL_DIMM_FWA_NOTSUPP 6 +#define ND_INTEL_DIMM_FWA_ERROR 7 + +struct nd_intel_fw_activate_dimminfo { + u32 status; + u16 result; + u8 state; + u8 reserved[7]; +} __packed; + +struct nd_intel_fw_activate_arm { + u8 activate_arm; + u32 status; +} __packed; + +/* Root device command payloads */ +#define ND_INTEL_BUS_FWA_CAP_FWQUIESCE (1 << 0) +#define ND_INTEL_BUS_FWA_CAP_OSQUIESCE (1 << 1) +#define ND_INTEL_BUS_FWA_CAP_RESET (1 << 2) + +struct nd_intel_bus_fw_activate_businfo { + u32 status; + u16 reserved; + u8 state; + u8 capability; + u64 activate_tmo; + u64 cpu_quiesce_tmo; + u64 io_quiesce_tmo; + u64 max_quiesce_tmo; +} __packed; + +#define ND_INTEL_BUS_FWA_STATUS_NOARM (6 | 1 << 16) +#define ND_INTEL_BUS_FWA_STATUS_BUSY (6 | 2 << 16) +#define ND_INTEL_BUS_FWA_STATUS_NOFW (6 | 3 << 16) +#define ND_INTEL_BUS_FWA_STATUS_TMO (6 | 4 << 16) +#define ND_INTEL_BUS_FWA_STATUS_NOIDLE (6 | 5 << 16) +#define ND_INTEL_BUS_FWA_STATUS_ABORT (6 | 6 << 16) + +struct nd_intel_bus_fw_activate { + u8 iodev_state; + u32 status; +} __packed; #endif diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index da097149d94d..97c122628975 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -18,6 +18,7 @@ /* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */ #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" +#define UUID_INTEL_BUS "c7d8acd4-2df8-4b82-9f65-a325335af149" /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */ #define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6" @@ -65,6 +66,13 @@ enum nvdimm_family_cmds { NVDIMM_INTEL_QUERY_OVERWRITE = 26, NVDIMM_INTEL_SET_MASTER_PASSPHRASE = 27, NVDIMM_INTEL_MASTER_SECURE_ERASE = 28, + NVDIMM_INTEL_FW_ACTIVATE_DIMMINFO = 29, + NVDIMM_INTEL_FW_ACTIVATE_ARM = 30, +}; + +enum nvdimm_bus_family_cmds { + NVDIMM_BUS_INTEL_FW_ACTIVATE_BUSINFO = 1, + NVDIMM_BUS_INTEL_FW_ACTIVATE = 2, }; #define NVDIMM_INTEL_SECURITY_CMDMASK \ @@ -75,13 +83,22 @@ enum nvdimm_family_cmds { | 1 << NVDIMM_INTEL_SET_MASTER_PASSPHRASE \ | 1 << NVDIMM_INTEL_MASTER_SECURE_ERASE) +#define NVDIMM_INTEL_FW_ACTIVATE_CMDMASK \ +(1 << NVDIMM_INTEL_FW_ACTIVATE_DIMMINFO | 1 << NVDIMM_INTEL_FW_ACTIVATE_ARM) + +#define NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK \ +(1 << NVDIMM_BUS_INTEL_FW_ACTIVATE_BUSINFO | 1 << NVDIMM_BUS_INTEL_FW_ACTIVATE) + #define NVDIMM_INTEL_CMDMASK \ (NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN \ - | NVDIMM_INTEL_SECURITY_CMDMASK) + | NVDIMM_INTEL_SECURITY_CMDMASK | NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) + +#define NVDIMM_INTEL_DENY_CMDMASK \ +(NVDIMM_INTEL_SECURITY_CMDMASK | NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) enum nfit_uuids { /* for simplicity alias the uuid index with the family id */ @@ -90,6 +107,11 @@ enum nfit_uuids { NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2, NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT, NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV, + /* + * to_nfit_bus_uuid() expects to translate bus uuid family ids + * to a UUID index using NVDIMM_FAMILY_MAX as an offset + */ + NFIT_BUS_INTEL = NVDIMM_FAMILY_MAX + NVDIMM_BUS_FAMILY_INTEL, NFIT_SPA_VOLATILE, NFIT_SPA_PM, NFIT_SPA_DCR, @@ -238,6 +260,7 @@ struct acpi_nfit_desc { unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; unsigned long bus_dsm_mask; + unsigned long family_dsm_mask[NVDIMM_BUS_FAMILY_MAX + 1]; unsigned int platform_cap; unsigned int scrub_tmo; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e9468b9332bd..8cf1e4884fd5 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -248,7 +248,8 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR #define NVDIMM_BUS_FAMILY_NFIT 0 -#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT +#define NVDIMM_BUS_FAMILY_INTEL 1 +#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_INTEL #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3-59-g8ed1b From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:12 -0700 Subject: bpf: Implement bpf iterator for map elements The bpf iterator for map elements are implemented. The bpf program will receive four parameters: bpf_iter_meta *meta: the meta data bpf_map *map: the bpf_map whose elements are traversed void *key: the key of one element void *value: the value of the same element Here, meta and map pointers are always valid, and key has register type PTR_TO_RDONLY_BUF_OR_NULL and value has register type PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key and value during verification time. Later, these values will be compared against the values in the actual map to ensure all accesses are within range. A new field iter_seq_info is added to bpf_map_ops which is used to add map type specific information, i.e., seq_ops, init/fini seq_file func and seq_file private data size. Subsequent patches will have actual implementation for bpf_map_ops->iter_seq_info. In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog attr->link_create.flags, which indicates that attr->link_create.target_fd is a map_fd. The reason for such an explicit flag is for possible future cases where one bpf iterator may allow more than one possible customization, e.g., pid and cgroup id for task_file. Current kernel internal implementation only allows the target to register at most one required bpf_iter_link_info. To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/map_iter.c | 30 ++++++++++++++- tools/include/uapi/linux/bpf.h | 7 ++++ 5 files changed, 128 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9c4bb08f616..4175cf1f4665 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -107,6 +107,9 @@ struct bpf_map_ops { /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; + + /* bpf_iter info used to open a seq_file */ + const struct bpf_iter_seq_info *iter_seq_info; }; struct bpf_map_memory { @@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + struct bpf_map *map; }; +typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux); + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; + bpf_iter_check_target_t check_target; u32 ctx_arg_info_size; + enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; @@ -1223,6 +1232,13 @@ struct bpf_iter_meta { u64 seq_num; }; +struct bpf_iter__bpf_map_elem { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(void *, value); +}; + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. */ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + static int __init bpf_map_iter_init(void) { + int ret; + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; - return bpf_iter_reg_target(&bpf_map_reg_info); + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. -- cgit v1.2.3-59-g8ed1b From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) -- cgit v1.2.3-59-g8ed1b From c1931c9784ebb5787c0784c112fb8baa5e8455b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:59 -0700 Subject: bpf: Implement BPF XDP link-specific introspection APIs Implement XDP link-specific show_fdinfo and link_info to emit ifindex. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-7-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ net/core/dev.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 87823fb9c123..e1ba4ae6a916 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4069,6 +4069,9 @@ struct bpf_link_info { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); diff --git a/net/core/dev.c b/net/core/dev.c index 49f284f51a22..82ce0920b172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8996,6 +8996,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link) kfree(xdp_link); } +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { @@ -9041,6 +9070,8 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; -- cgit v1.2.3-59-g8ed1b From 1a0c02ba643ed05c07ddf14d87c3bec640666836 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 20 Jul 2020 08:50:58 -0700 Subject: dmaengine: idxd: add missing invalid flags field to completion Add missing "invalid flags" field to completion record struct. Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159526025819.49266.13176787210106133664.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1f412fbf561b..5d07a3ff9e96 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -178,6 +178,12 @@ struct dsa_completion_record { uint32_t bytes_completed; uint64_t fault_addr; union { + /* common record */ + struct { + uint32_t invalid_flags:24; + uint32_t rsvd2:8; + }; + uint16_t delta_rec_size; uint16_t crc_val; -- cgit v1.2.3-59-g8ed1b From 06f67c47076e5c3ee65276171596479dcc3a3941 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:14 +0800 Subject: btrfs: use __u16 for the return value of btrfs_qgroup_level() The qgroup level is limited to u16, so no need to use u64 for it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- include/uapi/linux/btrfs_tree.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 787128d7e196..229e461dbfc3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -538,7 +538,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, - "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index a3f3975df0de..9ba64ca6b4ac 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -913,9 +913,9 @@ struct btrfs_free_space_info { #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) #define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline __u64 btrfs_qgroup_level(__u64 qgroupid) +static inline __u16 btrfs_qgroup_level(__u64 qgroupid) { - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; + return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT); } /* -- cgit v1.2.3-59-g8ed1b From 137c541821a83debb63b3fa8abdd1cbc41bdf3a1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:58 +0900 Subject: btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl With the recent addition of filesystem checksum types other than CRC32c, it is not anymore hard-coded which checksum type a btrfs filesystem uses. Up to now there is no good way to read the filesystem checksum, apart from reading the filesystem UUID and then query sysfs for the checksum type. Add a new csum_type and csum_size fields to the BTRFS_IOC_FS_INFO ioctl command which usually is used to query filesystem features. Also add a flags member indicating that the kernel responded with a set csum_type and csum_size field. For compatibility reasons, only return the csum_type and csum_size if the BTRFS_FS_INFO_FLAG_CSUM_INFO flag was passed to the kernel. Also clear any unknown flags so we don't pass false positives to user-space newer than the kernel. To simplify further additions to the ioctl, also switch the padding to a u8 array. Pahole was used to verify the result of this switch: The csum members are added before flags, which might look odd, but this is to keep the alignment requirements and not to introduce holes in the structure. $ pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko struct btrfs_ioctl_fs_info_args { __u64 max_id; /* 0 8 */ __u64 num_devices; /* 8 8 */ __u8 fsid[16]; /* 16 16 */ __u32 nodesize; /* 32 4 */ __u32 sectorsize; /* 36 4 */ __u32 clone_alignment; /* 40 4 */ __u16 csum_type; /* 44 2 */ __u16 csum_size; /* 46 2 */ __u64 flags; /* 48 8 */ __u8 reserved[968]; /* 56 968 */ /* size: 1024, cachelines: 16, members: 10 */ }; Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms") Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm") CC: stable@vger.kernel.org # 5.5+ Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++++++++--- include/uapi/linux/btrfs.h | 14 ++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab34179d7cbc..2854f4a40787 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3217,11 +3217,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; int ret = 0; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; @@ -3237,6 +3241,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e6b6cb0f8bc6..24f6848ad78e 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -243,6 +243,13 @@ struct btrfs_ioctl_dev_info_args { __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ }; +/* + * Retrieve information about the filesystem + */ + +/* Request information about checksum type and size */ +#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -250,8 +257,11 @@ struct btrfs_ioctl_fs_info_args { __u32 nodesize; /* out */ __u32 sectorsize; /* out */ __u32 clone_alignment; /* out */ - __u32 reserved32; - __u64 reserved[122]; /* pad to 1k */ + /* See BTRFS_FS_INFO_FLAG_* */ + __u16 csum_type; /* out */ + __u16 csum_size; /* out */ + __u64 flags; /* in/out */ + __u8 reserved[968]; /* pad to 1k */ }; /* -- cgit v1.2.3-59-g8ed1b From 0fb408a558aadbaa58beb75b02c95741e1fbb514 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:59 +0900 Subject: btrfs: add filesystem generation to FS_INFO ioctl Add retrieval of the filesystem's generation to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_GENERATION flag in btrfs_ioctl_fs_info_args::flags. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ include/uapi/linux/btrfs.h | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2854f4a40787..55dd20d0f9cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3247,6 +3247,11 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; } + if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { + fi_args->generation = fs_info->generation; + fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 24f6848ad78e..9b82e01c191d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -250,6 +250,9 @@ struct btrfs_ioctl_dev_info_args { /* Request information about checksum type and size */ #define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) +/* Request information about filesystem generation */ +#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -261,7 +264,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_type; /* out */ __u16 csum_size; /* out */ __u64 flags; /* in/out */ - __u8 reserved[968]; /* pad to 1k */ + __u64 generation; /* out */ + __u8 reserved[960]; /* pad to 1k */ }; /* -- cgit v1.2.3-59-g8ed1b From 49bac897683340457a0ab1f76b924a1220bdb604 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:29:00 +0900 Subject: btrfs: add metadata_uuid to FS_INFO ioctl Add retrieval of the filesystem's metadata UUID to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_METADATA_UUID flag in btrfs_ioctl_fs_info_args::flags. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 ++++++ include/uapi/linux/btrfs.h | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 55dd20d0f9cb..b4ddf51ae377 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3252,6 +3252,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } + if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { + memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, + sizeof(fi_args->metadata_uuid)); + fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9b82e01c191d..2c39d15a2beb 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -252,6 +252,8 @@ struct btrfs_ioctl_dev_info_args { /* Request information about filesystem generation */ #define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) +/* Request information about filesystem metadata UUID */ +#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2) struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ @@ -265,7 +267,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_size; /* out */ __u64 flags; /* in/out */ __u64 generation; /* out */ - __u8 reserved[960]; /* pad to 1k */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */ + __u8 reserved[944]; /* pad to 1k */ }; /* -- cgit v1.2.3-59-g8ed1b From 50c8a002bfd43798768ad07833b2b9d4b4d5274f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:29:03 -0700 Subject: platform/x86: ISST: drop a duplicated word in isst_if.h Drop the repeated word "for" in a comment. Signed-off-by: Randy Dunlap Cc: Srinivas Pandruvada Cc: platform-driver-x86@vger.kernel.org Cc: Darren Hart Cc: Andy Shevchenko Acked-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- include/uapi/linux/isst_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 0a52b7b093d3..ba078f8e9add 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -69,7 +69,7 @@ struct isst_if_cpu_maps { * @logical_cpu: Logical CPU number to get target PCI device. * @reg: PUNIT register offset * @value: For write operation value to write and for - * for read placeholder read value + * read placeholder read value * * Structure to specify read/write data to PUNIT registers. */ -- cgit v1.2.3-59-g8ed1b From 1e6b57d6421f0343dd11619612e5ff8930cddf38 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 11 Jun 2020 11:11:32 -0400 Subject: unexport linux/elfcore.h It's unusable from userland - it uses elf_gregset_t, which is not provided by exported headers. glibc has it in sys/procfs.h, but the same file defines struct elf_prstatus, so linux/elfcore.h can't be included once sys/procfs.h has been pulled. Same goes for uclibc and dietlibc simply doesn't have elf_gregset_t defined anywhere. IOW, no userland source is including that thing. Signed-off-by: Al Viro --- include/linux/elfcore.h | 69 +++++++++++++++++++++++++++-- include/uapi/linux/elfcore.h | 101 ------------------------------------------- scripts/headers_install.sh | 1 - usr/include/Makefile | 1 - 4 files changed, 66 insertions(+), 106 deletions(-) delete mode 100644 include/uapi/linux/elfcore.h (limited to 'include/uapi') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 4cad0e784b28..96ab215dad2d 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -5,12 +5,75 @@ #include #include #include - -#include -#include +#include +#include +#include +#include +#include +#include struct coredump_params; +struct elf_siginfo +{ + int si_signo; /* signal number */ + int si_code; /* extra code */ + int si_errno; /* errno */ +}; + +/* + * Definitions to generate Intel SVR4-like core files. + * These mostly have the same names as the SVR4 types with "elf_" + * tacked on the front to prevent clashes with linux definitions, + * and the typedef forms have been avoided. This is mostly like + * the SVR4 structure, but more Linuxy, with things that Linux does + * not support and which gdb doesn't really use excluded. + */ +struct elf_prstatus +{ + struct elf_siginfo pr_info; /* Info associated with signal */ + short pr_cursig; /* Current signal */ + unsigned long pr_sigpend; /* Set of pending signals */ + unsigned long pr_sighold; /* Set of held signals */ + pid_t pr_pid; + pid_t pr_ppid; + pid_t pr_pgrp; + pid_t pr_sid; + struct __kernel_old_timeval pr_utime; /* User time */ + struct __kernel_old_timeval pr_stime; /* System time */ + struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ + struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ + elf_gregset_t pr_reg; /* GP registers */ +#ifdef CONFIG_BINFMT_ELF_FDPIC + /* When using FDPIC, the loadmap addresses need to be communicated + * to GDB in order for GDB to do the necessary relocations. The + * fields (below) used to communicate this information are placed + * immediately after ``pr_reg'', so that the loadmap addresses may + * be viewed as part of the register set if so desired. + */ + unsigned long pr_exec_fdpic_loadmap; + unsigned long pr_interp_fdpic_loadmap; +#endif + int pr_fpvalid; /* True if math co-processor being used. */ +}; + +#define ELF_PRARGSZ (80) /* Number of chars for args */ + +struct elf_prpsinfo +{ + char pr_state; /* numeric process state */ + char pr_sname; /* char for pr_state */ + char pr_zomb; /* zombie */ + char pr_nice; /* nice val */ + unsigned long pr_flag; /* flags */ + __kernel_uid_t pr_uid; + __kernel_gid_t pr_gid; + pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; + /* Lots missing */ + char pr_fname[16]; /* filename of executable */ + char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ +}; + static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) { #ifdef ELF_CORE_COPY_REGS diff --git a/include/uapi/linux/elfcore.h b/include/uapi/linux/elfcore.h deleted file mode 100644 index baf03562306d..000000000000 --- a/include/uapi/linux/elfcore.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_ELFCORE_H -#define _UAPI_LINUX_ELFCORE_H - -#include -#include -#include -#include -#include -#include - -struct elf_siginfo -{ - int si_signo; /* signal number */ - int si_code; /* extra code */ - int si_errno; /* errno */ -}; - - -#ifndef __KERNEL__ -typedef elf_greg_t greg_t; -typedef elf_gregset_t gregset_t; -typedef elf_fpregset_t fpregset_t; -typedef elf_fpxregset_t fpxregset_t; -#define NGREG ELF_NGREG -#endif - -/* - * Definitions to generate Intel SVR4-like core files. - * These mostly have the same names as the SVR4 types with "elf_" - * tacked on the front to prevent clashes with linux definitions, - * and the typedef forms have been avoided. This is mostly like - * the SVR4 structure, but more Linuxy, with things that Linux does - * not support and which gdb doesn't really use excluded. - * Fields present but not used are marked with "XXX". - */ -struct elf_prstatus -{ -#if 0 - long pr_flags; /* XXX Process flags */ - short pr_why; /* XXX Reason for process halt */ - short pr_what; /* XXX More detailed reason */ -#endif - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned long pr_sigpend; /* Set of pending signals */ - unsigned long pr_sighold; /* Set of held signals */ -#if 0 - struct sigaltstack pr_altstack; /* Alternate stack info */ - struct sigaction pr_action; /* Signal action for current sig */ -#endif - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct __kernel_old_timeval pr_utime; /* User time */ - struct __kernel_old_timeval pr_stime; /* System time */ - struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ - struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ -#if 0 - long pr_instr; /* Current instruction */ -#endif - elf_gregset_t pr_reg; /* GP registers */ -#ifdef CONFIG_BINFMT_ELF_FDPIC - /* When using FDPIC, the loadmap addresses need to be communicated - * to GDB in order for GDB to do the necessary relocations. The - * fields (below) used to communicate this information are placed - * immediately after ``pr_reg'', so that the loadmap addresses may - * be viewed as part of the register set if so desired. - */ - unsigned long pr_exec_fdpic_loadmap; - unsigned long pr_interp_fdpic_loadmap; -#endif - int pr_fpvalid; /* True if math co-processor being used. */ -}; - -#define ELF_PRARGSZ (80) /* Number of chars for args */ - -struct elf_prpsinfo -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned long pr_flag; /* flags */ - __kernel_uid_t pr_uid; - __kernel_gid_t pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; - -#ifndef __KERNEL__ -typedef struct elf_prstatus prstatus_t; -typedef struct elf_prpsinfo prpsinfo_t; -#define PRARGSZ ELF_PRARGSZ -#endif - - -#endif /* _UAPI_LINUX_ELFCORE_H */ diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 955cf3aedf21..9314247bb222 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -86,7 +86,6 @@ arch/x86/include/uapi/asm/auxvec.h:CONFIG_X86_64 arch/x86/include/uapi/asm/mman.h:CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS include/uapi/asm-generic/fcntl.h:CONFIG_64BIT include/uapi/linux/atmdev.h:CONFIG_COMPAT -include/uapi/linux/elfcore.h:CONFIG_BINFMT_ELF_FDPIC include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE diff --git a/usr/include/Makefile b/usr/include/Makefile index 55362f3ab393..f6b3c85d900e 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -28,7 +28,6 @@ no-header-test += linux/am437x-vpfe.h no-header-test += linux/android/binder.h no-header-test += linux/android/binderfs.h no-header-test += linux/coda.h -no-header-test += linux/elfcore.h no-header-test += linux/errqueue.h no-header-test += linux/fsmap.h no-header-test += linux/hdlc/ioctl.h -- cgit v1.2.3-59-g8ed1b From 8f4c0e01789c18674acdf17cae3822b3dc3db715 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Wed, 22 Jul 2020 10:40:16 -0400 Subject: hsr: enhance netlink socket interface to support PRP Parallel Redundancy Protocol (PRP) is another redundancy protocol introduced by IEC 63439 standard. It is similar to HSR in many aspects:- - Use a pair of Ethernet interfaces to created the PRP device - Use a 6 byte redundancy protocol part (RCT, Redundancy Check Trailer) similar to HSR Tag. - Has Link Redundancy Entity (LRE) that works with RCT to implement redundancy. Key difference is that the protocol unit is a trailer instead of a prefix as in HSR. That makes it inter-operable with tradition network components such as bridges/switches which treat it as pad bytes, whereas HSR nodes requires some kind of translators (Called redbox) to talk to regular network devices. This features allows regular linux box to be converted to a DAN-P box. DAN-P stands for Dual Attached Node - PRP similar to DAN-H (Dual Attached Node - HSR). Add a comment at the header/source code to explicitly state that the driver files also handles PRP protocol as well. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- include/uapi/linux/hsr_netlink.h | 2 +- include/uapi/linux/if_link.h | 12 +++++++++++- net/hsr/Kconfig | 35 +++++++++++++++++++++++------------ net/hsr/hsr_debugfs.c | 2 +- net/hsr/hsr_device.c | 7 +++++-- net/hsr/hsr_device.h | 2 ++ net/hsr/hsr_forward.c | 2 ++ net/hsr/hsr_forward.h | 2 ++ net/hsr/hsr_framereg.c | 1 + net/hsr/hsr_framereg.h | 2 ++ net/hsr/hsr_main.c | 2 ++ net/hsr/hsr_main.h | 11 ++++++++++- net/hsr/hsr_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/hsr/hsr_netlink.h | 2 ++ net/hsr/hsr_slave.c | 2 ++ net/hsr/hsr_slave.h | 2 ++ 16 files changed, 99 insertions(+), 25 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hsr_netlink.h b/include/uapi/linux/hsr_netlink.h index c218ef9c35dd..d540ea9bbef4 100644 --- a/include/uapi/linux/hsr_netlink.h +++ b/include/uapi/linux/hsr_netlink.h @@ -17,7 +17,7 @@ /* Generic Netlink HSR family definition */ -/* attributes */ +/* attributes for HSR or PRP node */ enum { HSR_A_UNSPEC, HSR_A_NODE_ADDR, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index af8f31987526..63af64646358 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -907,7 +907,14 @@ enum { #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) -/* HSR section */ +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; enum { IFLA_HSR_UNSPEC, @@ -917,6 +924,9 @@ enum { IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ IFLA_HSR_SEQ_NR, IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ __IFLA_HSR_MAX, }; diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 8095b034e76e..1b048c17b6c8 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -4,24 +4,35 @@ # config HSR - tristate "High-availability Seamless Redundancy (HSR)" + tristate "High-availability Seamless Redundancy (HSR & PRP)" help + This enables IEC 62439 defined High-availability Seamless + Redundancy (HSR) and Parallel Redundancy Protocol (PRP). + If you say Y here, then your Linux box will be able to act as a - DANH ("Doubly attached node implementing HSR"). For this to work, - your Linux box needs (at least) two physical Ethernet interfaces, - and it must be connected as a node in a ring network together with - other HSR capable nodes. + DANH ("Doubly attached node implementing HSR") or DANP ("Doubly + attached node implementing PRP"). For this to work, your Linux box + needs (at least) two physical Ethernet interfaces. + + For DANH, it must be connected as a node in a ring network together + with other HSR capable nodes. All Ethernet frames sent over the HSR + device will be sent in both directions on the ring (over both slave + ports), giving a redundant, instant fail-over network. Each HSR node + in the ring acts like a bridge for HSR frames, but filters frames + that have been forwarded earlier. - All Ethernet frames sent over the hsr device will be sent in both - directions on the ring (over both slave ports), giving a redundant, - instant fail-over network. Each HSR node in the ring acts like a - bridge for HSR frames, but filters frames that have been forwarded - earlier. + For DANP, it must be connected as a node connecting to two + separate networks over the two slave interfaces. Like HSR, Ethernet + frames sent over the PRP device will be sent to both networks giving + a redundant, instant fail-over network. Unlike HSR, PRP networks + can have Singly Attached Nodes (SAN) such as PC, printer, bridges + etc and will be able to communicate with DANP nodes. This code is a "best effort" to comply with the HSR standard as described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), - but no compliancy tests have been made. Use iproute2 to select - the version you desire. + and PRP standard described in IEC 62439-4:2012 (PRP), but no + compliancy tests have been made. Use iproute2 to select the protocol + you would like to use. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 9787ef11ca71..c1932c0a15be 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,5 +1,5 @@ /* - * hsr_debugfs code + * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 8a927b647829..40ac45123a62 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -3,9 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se - * * This file contains device methods for creating, using and destroying - * virtual HSR devices. + * virtual HSR or PRP devices. */ #include @@ -427,6 +426,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + /* currently PRP is not supported */ + if (protocol_version == PRP_V1) + return -EPROTONOSUPPORT; + /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index b8f9262ed101..868373822ee4 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_DEVICE_H diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ab8dca0c0b65..55adb4dbd235 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame router for HSR and PRP. */ #include "hsr_forward.h" diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 51a69295566c..b2a6fa319d94 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FORWARD_H diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 530de24b1fb5..13b2190e6556 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -8,6 +8,7 @@ * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. + * Same code handles filtering of duplicates for PRP as well. */ #include diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..c06447780d05 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FRAMEREG_H diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 144da15f0a81..2fd1976e5b1c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Event handling for HSR and PRP devices. */ #include diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index f74193465bf5..8cf10d67d5f9 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_PRIVATE_H @@ -131,6 +133,13 @@ struct hsr_port { enum hsr_port_type type; }; +/* used by driver internally to differentiate various protocols */ +enum hsr_version { + HSR_V0 = 0, + HSR_V1, + PRP_V1, +}; + struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; @@ -141,7 +150,7 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6e14b7d22639..06c3cd988760 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -4,7 +4,7 @@ * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * - * Routines for handling Netlink messages for HSR. + * Routines for handling Netlink messages for HSR and PRP. */ #include "hsr_netlink.h" @@ -22,6 +22,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_VERSION] = { .type = NLA_U8 }, [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, + [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -31,8 +32,10 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + enum hsr_version proto_version; + unsigned char multicast_spec; + u8 proto = HSR_PROTOCOL_HSR; struct net_device *link[2]; - unsigned char multicast_spec, hsr_version; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); @@ -69,18 +72,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); + if (data[IFLA_HSR_PROTOCOL]) + proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]); + + if (proto >= HSR_PROTOCOL_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol\n"); + return -EINVAL; + } + if (!data[IFLA_HSR_VERSION]) { - hsr_version = 0; + proto_version = HSR_V0; } else { - hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); - if (hsr_version > 1) { + if (proto == HSR_PROTOCOL_PRP) { + NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported\n"); + return -EINVAL; + } + + proto_version = nla_get_u8(data[IFLA_HSR_VERSION]); + if (proto_version > HSR_V1) { NL_SET_ERR_MSG_MOD(extack, - "Only versions 0..1 are supported"); + "Only HSR version 0/1 supported\n"); return -EINVAL; } } - return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); + if (proto == HSR_PROTOCOL_PRP) + proto_version = PRP_V1; + + return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -102,6 +121,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); + u8 proto = HSR_PROTOCOL_HSR; struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); @@ -120,6 +140,10 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; + if (hsr->prot_version == PRP_V1) + proto = HSR_PROTOCOL_PRP; + if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) + goto nla_put_failure; return 0; diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 1121bb192a18..501552d9753b 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_NETLINK_H diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 25b6ffba26cd..b5c0834de338 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 8953ea279ce9..9708a4f0ec09 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -2,6 +2,8 @@ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H -- cgit v1.2.3-59-g8ed1b From 08b95c338e0c5a96e47f4ca314ea1e7580ecb5d7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 8 Jul 2020 14:11:52 +0300 Subject: fanotify: remove event FAN_DIR_MODIFY It was never enabled in uapi and its functionality is about to be superseded by events FAN_CREATE, FAN_DELETE, FAN_MOVE with group flag FAN_REPORT_NAME. Keep a place holder variable name_event instead of removing the name recording code since it will be used by the new events. Link: https://lore.kernel.org/r/20200708111156.24659-17-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 9 ++------- fs/notify/fsnotify.c | 2 +- include/linux/fsnotify.h | 6 ------ include/linux/fsnotify_backend.h | 4 +--- include/uapi/linux/fanotify.h | 1 - 5 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e417c64c365b..e6ba605732d7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -425,6 +425,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + bool name_event = false; /* * For queues with unlimited length lost events are not expected and @@ -442,12 +443,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); - } else if (mask & FAN_DIR_MODIFY && !(WARN_ON_ONCE(!file_name))) { - /* - * For FAN_DIR_MODIFY event, we report the fid of the directory - * and the name of the modified entry. - * Allocate an fanotify_name_event struct and copy the name. - */ + } else if (name_event && file_name) { event = fanotify_alloc_name_event(id, fsid, file_name, gfp); } else if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { event = fanotify_alloc_fid_event(id, fsid, gfp); @@ -528,7 +524,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(FAN_CREATE != FS_CREATE); BUILD_BUG_ON(FAN_DELETE != FS_DELETE); - BUILD_BUG_ON(FAN_DIR_MODIFY != FS_DIR_MODIFY); BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c4ac4d13e10f..f12a554be3f0 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -393,7 +393,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 316c9b820517..9b2566d273a9 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -30,12 +30,6 @@ static inline void fsnotify_name(struct inode *dir, __u32 mask, const struct qstr *name, u32 cookie) { fsnotify(dir, mask, child, FSNOTIFY_EVENT_INODE, name, cookie); - /* - * Send another flavor of the event without child inode data and - * without the specific event type (e.g. FS_CREATE|FS_IS_DIR). - * The name is relative to the dir inode the event is reported to. - */ - fsnotify(dir, FS_DIR_MODIFY, dir, FSNOTIFY_EVENT_INODE, name, 0); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0de130cbf72d..94a4ff3d5bbe 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -47,7 +47,6 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ -#define FS_DIR_MODIFY 0x00080000 /* Directory entry was modified */ #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ /* This inode cares about things that happen to its children. Always set for @@ -67,8 +66,7 @@ * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ -#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | \ - FS_DIR_MODIFY) +#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index a88c7c6d0692..7f2f17eacbf9 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -24,7 +24,6 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ -#define FAN_DIR_MODIFY 0x00080000 /* Directory entry was modified */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ -- cgit v1.2.3-59-g8ed1b From 83b7a59896dd24015a34b7f00027f0ff3747972f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 16 Jul 2020 11:42:26 +0300 Subject: fanotify: add basic support for FAN_REPORT_DIR_FID For now, the flag is mutually exclusive with FAN_REPORT_FID. Events include a single info record of type FAN_EVENT_INFO_TYPE_DFID with a directory file handle. For now, events are only reported for: - Directory modification events - Events on children of a watching directory - Events on directory objects Soon, we will add support for reporting the parent directory fid for events on non-directories with filesystem/mount mark and support for reporting both parent directory fid and child fid. Link: https://lore.kernel.org/r/20200716084230.30611-19-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 34 ++++++++++++++++-- fs/notify/fanotify/fanotify_user.c | 71 ++++++++++++++++++++++++++++++++------ include/linux/fanotify.h | 2 +- include/uapi/linux/fanotify.h | 11 +++--- 4 files changed, 101 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3baf93e998c1..fc2e1fab34af 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -223,7 +223,7 @@ out: static u32 fanotify_group_event_mask(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info, u32 event_mask, const void *data, - int data_type) + int data_type, struct inode *dir) { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | @@ -243,6 +243,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, /* Path type events are only relevant for files and dirs */ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) return 0; + } else if (!(fid_mode & FAN_REPORT_FID)) { + /* Do we have a directory inode to report? */ + if (!dir && !(event_mask & FS_ISDIR)) + return 0; } fsnotify_foreach_obj_type(type) { @@ -396,6 +400,28 @@ static struct inode *fanotify_fid_inode(u32 event_mask, const void *data, return fsnotify_data_inode(data, data_type); } +/* + * The inode to use as identifier when reporting dir fid depends on the event. + * Report the modified directory inode on dirent modification events. + * Report the "victim" inode if "victim" is a directory. + * Report the parent inode if "victim" is not a directory and event is + * reported to parent. + * Otherwise, do not report dir fid. + */ +static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, + int data_type, struct inode *dir) +{ + struct inode *inode = fsnotify_data_inode(data, data_type); + + if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return dir; + + if (S_ISDIR(inode->i_mode)) + return inode; + + return dir; +} + static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, gfp_t gfp) { @@ -491,10 +517,14 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); + struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); bool name_event = false; + if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) + id = dirid; + /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when @@ -605,7 +635,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, - data_type); + data_type, dir); if (!mask) return 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3842ef00b52e..e494400711c9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -216,7 +216,7 @@ static int process_access_response(struct fsnotify_group *group, } static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - const char *name, size_t name_len, + int info_type, const char *name, size_t name_len, char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; @@ -229,7 +229,7 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len || (name && !name_len)) + if (!fh_len) return 0; if (WARN_ON_ONCE(len < sizeof(info) || len > count)) @@ -239,8 +239,21 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename. */ - info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_FID; + switch (info_type) { + case FAN_EVENT_INFO_TYPE_FID: + case FAN_EVENT_INFO_TYPE_DFID: + if (WARN_ON_ONCE(name_len)) + return -EFAULT; + break; + case FAN_EVENT_INFO_TYPE_DFID_NAME: + if (WARN_ON_ONCE(!name || !name_len)) + return -EFAULT; + break; + default: + return -EFAULT; + } + + info.hdr.info_type = info_type; info.hdr.len = len; info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) @@ -304,8 +317,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct file *f = NULL; int ret, fd = FAN_NOFD; + int info_type = 0; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -346,9 +361,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* Event info records order is: dir fid + name, child fid */ if (fanotify_event_dir_fh_len(event)) { + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), - fanotify_info_name(info), + info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) return ret; @@ -358,9 +374,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } if (fanotify_event_object_fh_len(event)) { + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), - NULL, 0, buf, count); + info_type, NULL, 0, buf, count); if (ret < 0) return ret; @@ -861,6 +901,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; + unsigned int fid_mode = flags & FANOTIFY_FID_BITS; + unsigned int class = flags & FANOTIFY_CLASS_BITS; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -887,10 +929,19 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return -EINVAL; } - if ((flags & FANOTIFY_FID_BITS) && - (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF) + if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; + /* Reporting either object fid or dir fid */ + switch (fid_mode) { + case 0: + case FAN_REPORT_FID: + case FAN_REPORT_DIR_FID: + break; + default: + return -EINVAL; + } + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -926,7 +977,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FANOTIFY_CLASS_BITS) { + switch (class) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; @@ -1236,7 +1287,7 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 9); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index bbbee11d2521..4ddac97b2bf7 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -18,7 +18,7 @@ #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ FAN_CLASS_PRE_CONTENT) -#define FANOTIFY_FID_BITS (FAN_REPORT_FID) +#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DIR_FID) #define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ FAN_REPORT_TID | \ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 7f2f17eacbf9..21afebf77fd7 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -53,6 +53,7 @@ /* Flags to determine fanotify event format */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ +#define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ @@ -117,6 +118,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_FID 1 #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 +#define FAN_EVENT_INFO_TYPE_DFID 3 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -126,10 +128,11 @@ struct fanotify_event_info_header { }; /* - * Unique file identifier info record. This is used both for - * FAN_EVENT_INFO_TYPE_FID records and for FAN_EVENT_INFO_TYPE_DFID_NAME - * records. For FAN_EVENT_INFO_TYPE_DFID_NAME there is additionally a null - * terminated name immediately after the file handle. + * Unique file identifier info record. + * This structure is used for records of types FAN_EVENT_INFO_TYPE_FID, + * FAN_EVENT_INFO_TYPE_DFID and FAN_EVENT_INFO_TYPE_DFID_NAME. + * For FAN_EVENT_INFO_TYPE_DFID_NAME there is additionally a null terminated + * name immediately after the file handle. */ struct fanotify_event_info_fid { struct fanotify_event_info_header hdr; -- cgit v1.2.3-59-g8ed1b From 929943b38daf817f2e6d303ea04401651fc3bc05 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 16 Jul 2020 11:42:28 +0300 Subject: fanotify: add support for FAN_REPORT_NAME Introduce a new fanotify_init() flag FAN_REPORT_NAME. It requires the flag FAN_REPORT_DIR_FID and there is a constant for setting both flags named FAN_REPORT_DFID_NAME. For a group with flag FAN_REPORT_NAME, the parent fid and name are reported for directory entry modification events (create/detete/move) and for events on non-directory objects. Events on directories themselves are reported with their own fid and "." as the name. The parent fid and name are reported with an info record of type FAN_EVENT_INFO_TYPE_DFID_NAME, similar to the way that parent fid is reported with into type FAN_EVENT_INFO_TYPE_DFID, but with an appended null terminated name string. Link: https://lore.kernel.org/r/20200716084230.30611-21-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 18 ++++++++++++++- fs/notify/fanotify/fanotify_user.c | 45 ++++++++++++++++++++++++++++++-------- include/linux/fanotify.h | 2 +- include/uapi/linux/fanotify.h | 4 ++++ 4 files changed, 58 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index fc2e1fab34af..d793f3e56b26 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -522,9 +522,25 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); bool name_event = false; - if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) + if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { id = dirid; + /* + * We record file name only in a group with FAN_REPORT_NAME + * and when we have a directory inode to report. + * + * For directory entry modification event, we record the fid of + * the directory and the name of the modified entry. + * + * For event on non-directory that is reported to parent, we + * record the fid of the parent and the name of the child. + */ + if ((fid_mode & FAN_REPORT_NAME) && + ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + !(mask & FAN_ONDIR))) + name_event = true; + } + /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7caa64d028ba..6b839790cb42 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -64,18 +64,27 @@ static int fanotify_fid_info_len(int fh_len, int name_len) return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(struct fanotify_event *event) +static int fanotify_event_info_len(unsigned int fid_mode, + struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); int dir_fh_len = fanotify_event_dir_fh_len(event); int fh_len = fanotify_event_object_fh_len(event); int info_len = 0; + int dot_len = 0; - if (dir_fh_len) + if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not recorded in + * event on a directory, we will report the name ".". + */ + dot_len = 1; + } if (fh_len) - info_len += fanotify_fid_info_len(fh_len, 0); + info_len += fanotify_fid_info_len(fh_len, dot_len); return info_len; } @@ -91,6 +100,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, { size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -98,8 +108,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) goto out; - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { - event_size += fanotify_event_info_len( + if (fid_mode) { + event_size += fanotify_event_info_len(fid_mode, FANOTIFY_E(fsnotify_peek_first_event(group))); } @@ -325,7 +335,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(event); + fanotify_event_info_len(fid_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -374,12 +384,25 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + if (fid_mode == FAN_REPORT_FID || info_type) { /* * With only group flag FAN_REPORT_FID only type FID is * reported. Second info record type is always FID. */ info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the + * name "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || (event->mask & FAN_ONDIR)) { /* @@ -400,7 +423,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), - info_type, NULL, 0, buf, count); + info_type, dot, dot_len, buf, count); if (ret < 0) return ret; @@ -932,11 +955,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; - /* Reporting either object fid or dir fid */ + /* + * Reporting either object fid or dir fid. + * Child name is reported with parent fid so requires dir fid. + */ switch (fid_mode) { case 0: case FAN_REPORT_FID: case FAN_REPORT_DIR_FID: + case FAN_REPORT_DFID_NAME: break; default: return -EINVAL; @@ -1294,7 +1321,7 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 9); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 4ddac97b2bf7..3e9c56ee651f 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -18,7 +18,7 @@ #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ FAN_CLASS_PRE_CONTENT) -#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DIR_FID) +#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) #define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ FAN_REPORT_TID | \ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 21afebf77fd7..fbf9c5c7dd59 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -54,6 +54,10 @@ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ +#define FAN_REPORT_NAME 0x00000800 /* Report events with name */ + +/* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ +#define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ -- cgit v1.2.3-59-g8ed1b From e1613b5714ee6c186c9628e9958edf65e9d9cddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jul 2020 15:47:15 -0700 Subject: bpf: Fix bpf_ringbuf_output() signature to return long Due to bpf tree fix merge, bpf_ringbuf_output() signature ended up with int as a return type, while all other helpers got converted to returning long. So fix it in bpf-next now. Fixes: b0659d8a950d ("bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200727224715.652037-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification -- cgit v1.2.3-59-g8ed1b From bc2d214af5dbcf1e53be9a23d95905a585657ff4 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sun, 26 Jul 2020 17:35:09 +0200 Subject: scsi: target: tcmu: Implement tmr_notify callback This patch implements the tmr_notify callback for tcmu. When the callback is called, tcmu checks the list of aborted commands it received as parameter: - aborted commands in the qfull_queue are removed from the queue and target_complete_command is called - from the cmd_ids of aborted commands currently uncompleted in cmd ring it creates a list of aborted cmd_ids. Finally a TMR notification is written to cmd ring containing TMR type and cmd_id list. If there is no space in ring, the TMR notification is queued on a TMR specific queue. The TMR specific queue 'tmr_queue' can be seen as a extension of the cmd ring. At the end of each iexecution of tcmu_complete_commands() we check whether tmr_queue contains TMRs and try to move them onto the ring. If tmr_queue is not empty after that, we don't call run_qfull_queue() because commands must not overtake TMRs. This way we guarantee that cmd_ids in TMR notification received by userspace either match an active, not yet completed command or are no longer valid due to userspace having complete some cmd_ids meanwhile. New commands that were assigned to an aborted cmd_id will always appear on the cmd ring _after_ the TMR. Link: https://lore.kernel.org/r/20200726153510.13077-8-bstroesser@ts.fujitsu.com Reviewed-by: Mike Christie Signed-off-by: Bodo Stroesser Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 225 ++++++++++++++++++++++++++++++++-- include/uapi/linux/target_core_user.h | 25 ++++ 2 files changed, 241 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index bddd40f07929..cb5a561a46e8 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -137,6 +137,7 @@ struct tcmu_dev { struct mutex cmdr_lock; struct list_head qfull_queue; + struct list_head tmr_queue; uint32_t dbi_max; uint32_t dbi_thresh; @@ -183,6 +184,15 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 unsigned long flags; }; + +struct tcmu_tmr { + struct list_head queue_entry; + + uint8_t tmr_type; + uint32_t tmr_cmd_cnt; + int16_t tmr_cmd_ids[0]; +}; + /* * To avoid dead lock the mutex lock order should always be: * @@ -844,6 +854,9 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd, return false; } + if (!data_needed) + return true; + /* try to check and get the data blocks as needed */ space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh); if ((space * DATA_BLOCK_SIZE) < data_needed) { @@ -1106,6 +1119,60 @@ queue: return 1; } +/** + * queue_tmr_ring - queue tmr info to ring or internally + * @udev: related tcmu_dev + * @tmr: tcmu_tmr containing tmr info to queue + * + * Returns: + * 0 success + * 1 internally queued to wait for ring memory to free. + */ +static int +queue_tmr_ring(struct tcmu_dev *udev, struct tcmu_tmr *tmr) +{ + struct tcmu_tmr_entry *entry; + int cmd_size; + int id_list_sz; + struct tcmu_mailbox *mb = udev->mb_addr; + uint32_t cmd_head; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + goto out_free; + + id_list_sz = sizeof(tmr->tmr_cmd_ids[0]) * tmr->tmr_cmd_cnt; + cmd_size = round_up(sizeof(*entry) + id_list_sz, TCMU_OP_ALIGN_SIZE); + + if (!list_empty(&udev->tmr_queue) || + !is_ring_space_avail(udev, NULL, cmd_size, 0)) { + list_add_tail(&tmr->queue_entry, &udev->tmr_queue); + pr_debug("adding tmr %p on dev %s to TMR ring space wait queue\n", + tmr, udev->name); + return 1; + } + + cmd_head = ring_insert_padding(udev, cmd_size); + + entry = (void *)mb + CMDR_OFF + cmd_head; + memset(entry, 0, cmd_size); + tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_TMR); + tcmu_hdr_set_len(&entry->hdr.len_op, cmd_size); + entry->tmr_type = tmr->tmr_type; + entry->cmd_cnt = tmr->tmr_cmd_cnt; + memcpy(&entry->cmd_ids[0], &tmr->tmr_cmd_ids[0], id_list_sz); + tcmu_flush_dcache_range(entry, cmd_size); + + UPDATE_HEAD(mb->cmd_head, cmd_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + uio_event_notify(&udev->uio_info); + +out_free: + kfree(tmr); + + return 0; +} + static sense_reason_t tcmu_queue_cmd(struct se_cmd *se_cmd) { @@ -1141,6 +1208,85 @@ static void tcmu_set_next_deadline(struct list_head *queue, del_timer(timer); } +static int +tcmu_tmr_type(enum tcm_tmreq_table tmf) +{ + switch (tmf) { + case TMR_ABORT_TASK: return TCMU_TMR_ABORT_TASK; + case TMR_ABORT_TASK_SET: return TCMU_TMR_ABORT_TASK_SET; + case TMR_CLEAR_ACA: return TCMU_TMR_CLEAR_ACA; + case TMR_CLEAR_TASK_SET: return TCMU_TMR_CLEAR_TASK_SET; + case TMR_LUN_RESET: return TCMU_TMR_LUN_RESET; + case TMR_TARGET_WARM_RESET: return TCMU_TMR_TARGET_WARM_RESET; + case TMR_TARGET_COLD_RESET: return TCMU_TMR_TARGET_COLD_RESET; + case TMR_LUN_RESET_PRO: return TCMU_TMR_LUN_RESET_PRO; + default: return TCMU_TMR_UNKNOWN; + } +} + +static void +tcmu_tmr_notify(struct se_device *se_dev, enum tcm_tmreq_table tmf, + struct list_head *cmd_list) +{ + int i = 0, cmd_cnt = 0; + bool unqueued = false; + uint16_t *cmd_ids = NULL; + struct tcmu_cmd *cmd; + struct se_cmd *se_cmd; + struct tcmu_tmr *tmr; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + + mutex_lock(&udev->cmdr_lock); + + /* First we check for aborted commands in qfull_queue */ + list_for_each_entry(se_cmd, cmd_list, state_list) { + i++; + if (!se_cmd->priv) + continue; + cmd = se_cmd->priv; + /* Commands on qfull queue have no id yet */ + if (cmd->cmd_id) { + cmd_cnt++; + continue; + } + pr_debug("Removing aborted command %p from queue on dev %s.\n", + cmd, udev->name); + + list_del_init(&cmd->queue_entry); + tcmu_free_cmd(cmd); + target_complete_cmd(se_cmd, SAM_STAT_TASK_ABORTED); + unqueued = true; + } + if (unqueued) + tcmu_set_next_deadline(&udev->qfull_queue, &udev->qfull_timer); + + pr_debug("TMR event %d on dev %s, aborted cmds %d, afflicted cmd_ids %d\n", + tcmu_tmr_type(tmf), udev->name, i, cmd_cnt); + + tmr = kmalloc(sizeof(*tmr) + cmd_cnt * sizeof(*cmd_ids), GFP_KERNEL); + if (!tmr) + goto unlock; + + tmr->tmr_type = tcmu_tmr_type(tmf); + tmr->tmr_cmd_cnt = cmd_cnt; + + if (cmd_cnt != 0) { + cmd_cnt = 0; + list_for_each_entry(se_cmd, cmd_list, state_list) { + if (!se_cmd->priv) + continue; + cmd = se_cmd->priv; + if (cmd->cmd_id) + tmr->tmr_cmd_ids[cmd_cnt++] = cmd->cmd_id; + } + } + + queue_tmr_ring(udev, tmr); + +unlock: + mutex_unlock(&udev->cmdr_lock); +} + static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) { struct se_cmd *se_cmd = cmd->se_cmd; @@ -1208,11 +1354,43 @@ out: tcmu_free_cmd(cmd); } +static int tcmu_run_tmr_queue(struct tcmu_dev *udev) +{ + struct tcmu_tmr *tmr, *tmp; + LIST_HEAD(tmrs); + + if (list_empty(&udev->tmr_queue)) + return 1; + + pr_debug("running %s's tmr queue\n", udev->name); + + list_splice_init(&udev->tmr_queue, &tmrs); + + list_for_each_entry_safe(tmr, tmp, &tmrs, queue_entry) { + list_del_init(&tmr->queue_entry); + + pr_debug("removing tmr %p on dev %s from queue\n", + tmr, udev->name); + + if (queue_tmr_ring(udev, tmr)) { + pr_debug("ran out of space during tmr queue run\n"); + /* + * tmr was requeued, so just put all tmrs back in + * the queue + */ + list_splice_tail(&tmrs, &udev->tmr_queue); + return 0; + } + } + + return 1; +} + static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) { struct tcmu_mailbox *mb; struct tcmu_cmd *cmd; - int handled = 0; + bool free_space = false; if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { pr_err("ring broken, not handling completions\n"); @@ -1235,7 +1413,10 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) tcmu_flush_dcache_range(entry, ring_left < sizeof(*entry) ? ring_left : sizeof(*entry)); - if (tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_PAD) { + free_space = true; + + if (tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_PAD || + tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_TMR) { UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(entry->hdr.len_op), udev->cmdr_size); @@ -1256,9 +1437,9 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(entry->hdr.len_op), udev->cmdr_size); - - handled++; } + if (free_space) + free_space = tcmu_run_tmr_queue(udev); if (atomic_read(&global_db_count) > tcmu_global_max_blocks && idr_is_empty(&udev->commands) && list_empty(&udev->qfull_queue)) { @@ -1271,7 +1452,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) if (udev->cmd_time_out) tcmu_set_next_deadline(&udev->inflight_queue, &udev->cmd_timer); - return handled; + return free_space; } static void tcmu_check_expired_ring_cmd(struct tcmu_cmd *cmd) @@ -1381,6 +1562,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) INIT_LIST_HEAD(&udev->node); INIT_LIST_HEAD(&udev->timedout_entry); INIT_LIST_HEAD(&udev->qfull_queue); + INIT_LIST_HEAD(&udev->tmr_queue); INIT_LIST_HEAD(&udev->inflight_queue); idr_init(&udev->commands); @@ -1455,8 +1637,8 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); mutex_lock(&udev->cmdr_lock); - tcmu_handle_completions(udev); - run_qfull_queue(udev, false); + if (tcmu_handle_completions(udev)) + run_qfull_queue(udev, false); mutex_unlock(&udev->cmdr_lock); return 0; @@ -1609,6 +1791,16 @@ static void tcmu_blocks_release(struct radix_tree_root *blocks, } } +static void tcmu_remove_all_queued_tmr(struct tcmu_dev *udev) +{ + struct tcmu_tmr *tmr, *tmp; + + list_for_each_entry_safe(tmr, tmp, &udev->tmr_queue, queue_entry) { + list_del_init(&tmr->queue_entry); + kfree(tmr); + } +} + static void tcmu_dev_kref_release(struct kref *kref) { struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref); @@ -1631,6 +1823,8 @@ static void tcmu_dev_kref_release(struct kref *kref) if (tcmu_check_and_free_pending_cmd(cmd) != 0) all_expired = false; } + /* There can be left over TMR cmds. Remove them. */ + tcmu_remove_all_queued_tmr(udev); if (!list_empty(&udev->qfull_queue)) all_expired = false; idr_destroy(&udev->commands); @@ -1885,7 +2079,9 @@ static int tcmu_configure_device(struct se_device *dev) /* Initialise the mailbox of the ring buffer */ mb = udev->mb_addr; mb->version = TCMU_MAILBOX_VERSION; - mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | TCMU_MAILBOX_FLAG_CAP_READ_LEN; + mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | + TCMU_MAILBOX_FLAG_CAP_READ_LEN | + TCMU_MAILBOX_FLAG_CAP_TMR; mb->cmdr_off = CMDR_OFF; mb->cmdr_size = udev->cmdr_size; @@ -2055,6 +2251,15 @@ static void tcmu_reset_ring(struct tcmu_dev *udev, u8 err_level) del_timer(&udev->cmd_timer); + /* + * ring is empty and qfull queue never contains aborted commands. + * So TMRs in tmr queue do not contain relevant cmd_ids. + * After a ring reset userspace should do a fresh start, so + * even LUN RESET message is no longer relevant. + * Therefore remove all TMRs from qfull queue + */ + tcmu_remove_all_queued_tmr(udev); + run_qfull_queue(udev, false); mutex_unlock(&udev->cmdr_lock); @@ -2607,6 +2812,7 @@ static struct target_backend_ops tcmu_ops = { .destroy_device = tcmu_destroy_device, .free_device = tcmu_free_device, .parse_cdb = tcmu_parse_cdb, + .tmr_notify = tcmu_tmr_notify, .set_configfs_dev_params = tcmu_set_configfs_dev_params, .show_configfs_dev_params = tcmu_show_configfs_dev_params, .get_device_type = sbc_get_device_type, @@ -2633,7 +2839,8 @@ static void find_free_blocks(void) } /* Try to complete the finished commands first */ - tcmu_handle_completions(udev); + if (tcmu_handle_completions(udev)) + run_qfull_queue(udev, false); /* Skip the udevs in idle */ if (!udev->dbi_thresh) { diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b7b57967d90f..95b1597f16ae 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -45,6 +45,7 @@ #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ #define TCMU_MAILBOX_FLAG_CAP_OOOC (1 << 0) /* Out-of-order completions */ #define TCMU_MAILBOX_FLAG_CAP_READ_LEN (1 << 1) /* Read data length */ +#define TCMU_MAILBOX_FLAG_CAP_TMR (1 << 2) /* TMR notifications */ struct tcmu_mailbox { __u16 version; @@ -62,6 +63,7 @@ struct tcmu_mailbox { enum tcmu_opcode { TCMU_OP_PAD = 0, TCMU_OP_CMD, + TCMU_OP_TMR, }; /* @@ -128,6 +130,29 @@ struct tcmu_cmd_entry { } __packed; +struct tcmu_tmr_entry { + struct tcmu_cmd_entry_hdr hdr; + +#define TCMU_TMR_UNKNOWN 0 +#define TCMU_TMR_ABORT_TASK 1 +#define TCMU_TMR_ABORT_TASK_SET 2 +#define TCMU_TMR_CLEAR_ACA 3 +#define TCMU_TMR_CLEAR_TASK_SET 4 +#define TCMU_TMR_LUN_RESET 5 +#define TCMU_TMR_TARGET_WARM_RESET 6 +#define TCMU_TMR_TARGET_COLD_RESET 7 +/* Pseudo reset due to received PR OUT */ +#define TCMU_TMR_LUN_RESET_PRO 128 + __u8 tmr_type; + + __u8 __pad1; + __u16 __pad2; + __u32 cmd_cnt; + __u64 __pad3; + __u64 __pad4; + __u16 cmd_ids[0]; +} __packed; + #define TCMU_OP_ALIGN_SIZE sizeof(__u64) enum tcmu_genl_cmd { -- cgit v1.2.3-59-g8ed1b From 556c811f24b30cc883733a2eaf9e939817589231 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:09 +0300 Subject: RDMA/efa: Expose maximum TX doorbell batch The device reports the maximum number of bytes to be written before ringing the doorbell (zero means unlimited). This patch queries the max batch size and reports it back to the userspace library. Link: https://lore.kernel.org/r/20200722140312.3651-2-galpress@amazon.com Reviewed-by: Daniel Kranzdorf Reviewed-by: Firas JahJah Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 11 +++++++++++ drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 1 + include/uapi/rdma/efa-abi.h | 4 +++- 5 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index bef2bd291054..03e7388af06e 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -632,6 +632,17 @@ struct efa_admin_feature_queue_attr_desc { /* Maximum number of SGEs for a single RDMA read WQE */ u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. + * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; }; struct efa_admin_feature_aenq_desc { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index fabd8df2e78f..53cfde5c43d8 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -480,6 +480,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->max_llq_size = resp.u.queue_attr.max_llq_size; result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); if (err) { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 41ce4a476ee6..8df2a26d57d4 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -127,6 +127,7 @@ struct efa_com_get_device_attr_result { u16 max_sq_sge; u16 max_rq_sge; u16 max_wr_rdma_sge; + u16 max_tx_batch; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 08313f7c73bc..f49d14cebe4a 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1525,6 +1525,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; resp.inline_buf_size = dev->dev_attr.inline_buf_size; resp.max_llq_size = dev->dev_attr.max_llq_size; + resp.max_tx_batch = dev->dev_attr.max_tx_batch; if (udata && udata->outlen) { err = ib_copy_to_udata(udata, &resp, diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 53b6e2036a9b..10781763da37 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -31,6 +31,8 @@ struct efa_ibv_alloc_ucontext_resp { __u16 sub_cqs_per_cq; __u16 inline_buf_size; __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u8 reserved_90[6]; }; struct efa_ibv_alloc_pd_resp { -- cgit v1.2.3-59-g8ed1b From da2924bdca99768442c5e0ed0a9024ae79d62765 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:10 +0300 Subject: RDMA/efa: Expose minimum SQ size The device reports the minimum SQ size required for creation. This patch queries the min SQ size and reports it back to the userspace library. Link: https://lore.kernel.org/r/20200722140312.3651-3-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Shadi Ammouri Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 4 ++-- drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 1 + include/uapi/rdma/efa-abi.h | 3 ++- 5 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 03e7388af06e..5484b08bbc5d 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -606,8 +606,8 @@ struct efa_admin_feature_queue_attr_desc { /* Number of sub-CQs to be created for each CQ */ u16 sub_cqs_per_cq; - /* MBZ */ - u16 reserved; + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; /* Maximum number of SGEs (buffers) allowed for a single send WQE */ u16 max_wr_send_sges; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 53cfde5c43d8..6ac23627f65a 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -481,6 +481,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); if (err) { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 8df2a26d57d4..190bac23f585 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -128,6 +128,7 @@ struct efa_com_get_device_attr_result { u16 max_rq_sge; u16 max_wr_rdma_sge; u16 max_tx_batch; + u16 min_sq_depth; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index f49d14cebe4a..26102ab333b2 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1526,6 +1526,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) resp.inline_buf_size = dev->dev_attr.inline_buf_size; resp.max_llq_size = dev->dev_attr.max_llq_size; resp.max_tx_batch = dev->dev_attr.max_tx_batch; + resp.min_sq_wr = dev->dev_attr.min_sq_depth; if (udata && udata->outlen) { err = ib_copy_to_udata(udata, &resp, diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 10781763da37..7ef2306f8dd4 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -32,7 +32,8 @@ struct efa_ibv_alloc_ucontext_resp { __u16 inline_buf_size; __u32 max_llq_size; /* bytes */ __u16 max_tx_batch; /* units of 64 bytes */ - __u8 reserved_90[6]; + __u16 min_sq_wr; + __u8 reserved_a0[4]; }; struct efa_ibv_alloc_pd_resp { -- cgit v1.2.3-59-g8ed1b From a5d87b698547233321466b2dc91271f5855a4df6 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:11 +0300 Subject: RDMA/efa: User/kernel compatibility handshake mechanism Introduce a mechanism that performs an handshake between the userspace provider and kernel driver which verifies that the user supports all required features in order to operate correctly. The handshake verifies the needed functionality by comparing the reported device caps and the provider caps. If the device reports a non-zero capability the appropriate comp mask is required from the userspace provider in order to allocate the context. Link: https://lore.kernel.org/r/20200722140312.3651-4-galpress@amazon.com Reviewed-by: Shadi Ammouri Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 40 +++++++++++++++++++++++++++++++++++ include/uapi/rdma/efa-abi.h | 10 +++++++++ 2 files changed, 50 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 26102ab333b2..fda175836fb6 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1501,11 +1501,39 @@ static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) return efa_com_dealloc_uar(&dev->edev, ¶ms); } +#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \ + NULL : #_attr) + +static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, + const struct efa_ibv_alloc_ucontext_cmd *cmd) +{ + struct efa_dev *dev = to_edev(ibucontext->device); + char *attr_str; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + goto err; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + attr_str)) + goto err; + + return 0; + +err: + ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n", + attr_str); + return -EOPNOTSUPP; +} + int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) { struct efa_ucontext *ucontext = to_eucontext(ibucontext); struct efa_dev *dev = to_edev(ibucontext->device); struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_ibv_alloc_ucontext_cmd cmd = {}; struct efa_com_alloc_uar_result result; int err; @@ -1514,6 +1542,18 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) * we will ack input fields in our response. */ + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for alloc_ucontext\n"); + goto err_out; + } + + err = efa_user_comp_handshake(ibucontext, &cmd); + if (err) + goto err_out; + err = efa_com_alloc_uar(&dev->edev, &result); if (err) goto err_out; diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 7ef2306f8dd4..507a2862bedb 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -20,6 +20,16 @@ * hex bit offset of the field. */ +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + enum efa_ibv_user_cmds_supp_udata { EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, -- cgit v1.2.3-59-g8ed1b From 50935339c394adfb3d7253055e3bc10ee70264b0 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sat, 25 Jul 2020 19:02:25 +0200 Subject: netfilter: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connmark.h | 2 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/netfilter/Kconfig | 2 +- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nft_set_pipapo.c | 4 ++-- net/netfilter/xt_CONNSECMARK.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_nfacct.c | 2 +- net/netfilter/xt_time.c | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index 1aa5c955ee1e..f01c19b83a2b 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -4,7 +4,7 @@ #include -/* Copyright (C) 2002,2004 MARA Systems AB +/* Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * This program is free software; you can redistribute it and/or modify diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dc705769acc9..26a9193df783 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -6,7 +6,7 @@ * * DECnet Routing Message Grabulator * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ * * Author: Steven Whitehouse */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..25313c29d799 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -447,7 +447,7 @@ config NF_TABLES replace the existing {ip,ip6,arp,eb}_tables infrastructure. It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..5bfec829c12f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #include #include diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index cc6082a5f7ad..9944523f5c2c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. - * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..76acecf3e757 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. * * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * (C) 2006,2008 Red Hat, Inc., James Morris diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..e5ebc0810675 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..a97c2259bbc8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ -- cgit v1.2.3-59-g8ed1b From df78a0c0b67de58934877aad61e0431a2bd0caf1 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 1 Jun 2020 23:22:47 -0700 Subject: nl80211: S1G band and channel definitions Gives drivers the definitions needed to advertise support for S1G bands. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200602062247.23212-1-thomas@adapt-ip.com Link: https://lore.kernel.org/r/20200731055636.795173-1-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 9 ++------- include/net/cfg80211.h | 17 +++++++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/mac80211/chan.c | 7 ++++++- net/mac80211/scan.c | 1 + net/mac80211/tx.c | 1 + net/mac80211/util.c | 5 +++++ net/wireless/chan.c | 35 +++++++++++++++++++++++++++++++++++ net/wireless/core.c | 5 +++-- net/wireless/util.c | 8 ++++++++ 10 files changed, 94 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 919d15584d4a..3c0c33a9f30c 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -568,11 +568,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_40: phymode = MODE_11NG_HT40; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: - case NL80211_CHAN_WIDTH_80: - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: + default: phymode = MODE_UNKNOWN; break; } @@ -597,8 +593,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_80P80: phymode = MODE_11AC_VHT80_80; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: + default: phymode = MODE_UNKNOWN; break; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fc7e8807838d..ac6e58193426 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -417,6 +417,22 @@ struct ieee80211_edmg { enum ieee80211_edmg_bw_config bw_config; }; +/** + * struct ieee80211_sta_s1g_cap - STA's S1G capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ah S1G capabilities for a STA. + * + * @s1g_supported: is STA an S1G STA + * @cap: S1G capabilities information + * @nss_mcs: Supported NSS MCS set + */ +struct ieee80211_sta_s1g_cap { + bool s1g; + u8 cap[10]; /* use S1G_CAPAB_ */ + u8 nss_mcs[5]; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -448,6 +464,7 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_edmg edmg_cap; u16 n_iftype_data; const struct ieee80211_sband_iftype_data *iftype_data; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e6339ab1fce..ad183469f9af 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4437,6 +4437,11 @@ enum nl80211_key_mode { * attribute must be provided as well * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel + * @NL80211_CHAN_WIDTH_1: 1 MHz OFDM channel + * @NL80211_CHAN_WIDTH_2: 2 MHz OFDM channel + * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel + * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel + * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4447,6 +4452,11 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_160, NL80211_CHAN_WIDTH_5, NL80211_CHAN_WIDTH_10, + NL80211_CHAN_WIDTH_1, + NL80211_CHAN_WIDTH_2, + NL80211_CHAN_WIDTH_4, + NL80211_CHAN_WIDTH_8, + NL80211_CHAN_WIDTH_16, }; /** @@ -4457,11 +4467,15 @@ enum nl80211_chan_width { * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + * @NL80211_BSS_CHAN_WIDTH_1: control channel is 1 MHz wide + * @NL80211_BSS_CHAN_WIDTH_2: control channel is 2 MHz wide */ enum nl80211_bss_scan_width { NL80211_BSS_CHAN_WIDTH_20, NL80211_BSS_CHAN_WIDTH_10, NL80211_BSS_CHAN_WIDTH_5, + NL80211_BSS_CHAN_WIDTH_1, + NL80211_BSS_CHAN_WIDTH_2, }; /** @@ -4740,6 +4754,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) + * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4748,6 +4763,7 @@ enum nl80211_band { NL80211_BAND_5GHZ, NL80211_BAND_60GHZ, NL80211_BAND_6GHZ, + NL80211_BAND_S1GHZ, NUM_NL80211_BANDS, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e6e192f53e4e..08cf9da9c1e3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -313,9 +313,14 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, lockdep_assert_held(&local->chanctx_mtx); - /* don't optimize 5MHz, 10MHz, and radar_enabled confs */ + /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ad90bbe57457..8003be6dae8a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -913,6 +913,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, case NL80211_BSS_CHAN_WIDTH_10: local->scan_chandef.width = NL80211_CHAN_WIDTH_10; break; + default: case NL80211_BSS_CHAN_WIDTH_20: /* If scanning on oper channel, use whatever channel-type * is currently in use. diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a2941e5244f..ee30ef441f4a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -166,6 +166,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; + case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 21c94094a699..64a83ecd0a73 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3730,6 +3730,11 @@ u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index cddf92c5d09e..90f0f82cd9ca 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -153,6 +153,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -263,6 +268,21 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; @@ -911,6 +931,21 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index c623d9bf5096..1971d7e6eb55 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -803,10 +803,11 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60GHz band, there are no legacy rates, so + * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != NL80211_BAND_60GHZ && + if (WARN_ON((band != NL80211_BAND_60GHZ && + band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d3b76f94f55..26a977343c3b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -102,6 +102,8 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; + case NL80211_BAND_S1GHZ: + return 902000 + chan * 500; default: ; } @@ -210,6 +212,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; + case NL80211_BAND_S1GHZ: + /* Figure 9-589bd: 3 means unsupported, so != 3 means at least + * mandatory is ok. + */ + WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); + break; case NUM_NL80211_BANDS: default: WARN_ON(1); -- cgit v1.2.3-59-g8ed1b From 987021726f9f41a1daf335c57cd7b6261109cdb2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:21 -0700 Subject: net/wireless: nl80211.h: drop duplicate words in comments Drop doubled words in several comments. Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad183469f9af..f47a7a8d0216 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -363,7 +363,7 @@ * @NL80211_CMD_SET_STATION: Set station attributes for station identified by * %NL80211_ATTR_MAC on the interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_NEW_STATION: Add a station with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_STATION: Remove a station identified by %NL80211_ATTR_MAC * or, if no MAC address given, all stations, on the interface identified * by %NL80211_ATTR_IFINDEX. %NL80211_ATTR_MGMT_SUBTYPE and @@ -383,7 +383,7 @@ * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by * %NL80211_ATTR_MAC. * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC * or, if no MAC address given, all mesh paths, on the interface identified * by %NL80211_ATTR_IFINDEX. @@ -934,7 +934,7 @@ * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. * * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the - * the new channel information (Channel Switch Announcement - CSA) + * new channel information (Channel Switch Announcement - CSA) * in the beacon for some time (as defined in the * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the * new channel. Userspace provides the new channel information (using @@ -1113,7 +1113,7 @@ * randomization may be enabled and configured by specifying the * %NL80211_ATTR_MAC and %NL80211_ATTR_MAC_MASK attributes. * If a timeout is requested, use the %NL80211_ATTR_TIMEOUT attribute. - * A u64 cookie for further %NL80211_ATTR_COOKIE use is is returned in + * A u64 cookie for further %NL80211_ATTR_COOKIE use is returned in * the netlink extended ack message. * * To cancel a measurement, close the socket that requested it. @@ -1511,7 +1511,7 @@ enum nl80211_commands { * rates as defined by IEEE 802.11 7.3.2.2 but without the length * restriction (at most %NL80211_MAX_SUPP_RATES). * @NL80211_ATTR_STA_VLAN: interface index of VLAN interface to move station - * to, or the AP interface the station was originally added to to. + * to, or the AP interface the station was originally added to. * @NL80211_ATTR_STA_INFO: information about a station, part of station info * given for %NL80211_CMD_GET_STATION, nested attribute containing * info as possible, see &enum nl80211_sta_info. @@ -2084,7 +2084,7 @@ enum nl80211_commands { * @NL80211_ATTR_STA_SUPPORTED_CHANNELS: array of supported channels. * * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported - * supported operating classes. + * operating classes. * * @NL80211_ATTR_HANDLE_DFS: A flag indicating whether user space * controls DFS operation in IBSS mode. If the flag is included in @@ -2395,7 +2395,7 @@ enum nl80211_commands { * nl80211_txq_stats) * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy. * The smaller of this and the memory limit is enforced. - * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the + * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory limit (in bytes) for the * TXQ queues for this phy. The smaller of this and the packet limit is * enforced. * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes @@ -5652,7 +5652,7 @@ enum nl80211_feature_flags { * enum nl80211_ext_feature_index - bit index of extended features. * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can - * can request to use RRM (see %NL80211_ATTR_USE_RRM) with + * request to use RRM (see %NL80211_ATTR_USE_RRM) with * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set * the ASSOC_REQ_USE_RRM flag in the association request even if * NL80211_FEATURE_QUIET is not advertized. @@ -6061,7 +6061,7 @@ enum nl80211_dfs_state { }; /** - * enum enum nl80211_protocol_features - nl80211 protocol features + * enum nl80211_protocol_features - nl80211 protocol features * @NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP: nl80211 supports splitting * wiphy dumps (if requested by the application with the attribute * %NL80211_ATTR_SPLIT_WIPHY_DUMP. Also supported is filtering the -- cgit v1.2.3-59-g8ed1b From 0f55c0c500f2bbfc5cc5590cdf6973b3f64dc195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:22 -0700 Subject: net/wireless: wireless.h: drop duplicate word in comments Drop doubled word "threshold" in a comment. Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-2-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index 24f3371ad826..08967b3f19c8 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -914,7 +914,7 @@ union iwreq_data { struct iw_param sens; /* signal level threshold */ struct iw_param bitrate; /* default bit rate */ struct iw_param txpower; /* default transmit power */ - struct iw_param rts; /* RTS threshold threshold */ + struct iw_param rts; /* RTS threshold */ struct iw_param frag; /* Fragmentation threshold */ __u32 mode; /* Operation mode */ struct iw_param retry; /* Retry limits & lifetime */ -- cgit v1.2.3-59-g8ed1b From e3718a611470d311a92c60d4eb535270b49a7108 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Wed, 17 Jun 2020 09:30:33 +0200 Subject: cfg80211/mac80211: add mesh_param "mesh_nolearn" to skip path discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, before being able to forward a packet between two 802.11s nodes, both a PLINK handshake is performed upon receiving a beacon and then later a PREQ/PREP exchange for path discovery is performed on demand upon receiving a data frame to forward. When running a mesh protocol on top of an 802.11s interface, like batman-adv, we do not need the multi-hop mesh routing capabilities of 802.11s and usually set mesh_fwding=0. However, even with mesh_fwding=0 the PREQ/PREP path discovery is still performed on demand. Even though in this scenario the next hop PREQ/PREP will determine is always the direct 11s neighbor node. The new mesh_nolearn parameter allows to skip the PREQ/PREP exchange in this scenario, leading to a reduced delay, reduced packet buffering and simplifies HWMP in general. mesh_nolearn is still rather conservative in that if the packet destination is not a direct 11s neighbor, it will fall back to PREQ/PREP path discovery. For normal, multi-hop 802.11s mesh routing it is usually not advisable to enable mesh_nolearn as a transmission to a direct but distant neighbor might be worse than reaching that same node via a more robust / higher throughput etc. multi-hop path. Cc: Sven Eckelmann Cc: Simon Wunderlich Signed-off-by: Linus Lüssing Link: https://lore.kernel.org/r/20200617073034.26149-1-linus.luessing@c0d3.blue [fix nl80211 policy to range 0/1 only] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 7 +++++++ net/mac80211/cfg.c | 2 ++ net/mac80211/debugfs_netdev.c | 2 ++ net/mac80211/mesh_hwmp.c | 39 +++++++++++++++++++++++++++++++++++++++ net/wireless/mesh.c | 1 + net/wireless/nl80211.c | 7 ++++++- net/wireless/trace.h | 4 +++- 8 files changed, 66 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa4d5627397f..78b220950942 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1870,6 +1870,11 @@ struct bss_parameters { * connected to a mesh gate in mesh formation info. If false, the * value in mesh formation is determined by the presence of root paths * in the mesh path table + * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP + * for HWMP) if the destination is a direct neighbor. Note that this might + * not be the optimal decision as a multi-hop route might be better. So + * if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. */ struct mesh_config { u16 dot11MeshRetryTimeout; @@ -1901,6 +1906,7 @@ struct mesh_config { enum nl80211_mesh_power_mode power_mode; u16 dot11MeshAwakeWindowDuration; u32 plink_timeout; + bool dot11MeshNolearn; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f47a7a8d0216..a83d8faf88ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4236,6 +4236,12 @@ enum nl80211_mesh_power_mode { * field. If left unset then the mesh formation field will only * advertise such if there is an active root mesh path. * + * @NL80211_MESHCONF_NOLEARN: Try to avoid multi-hop path discovery (e.g. + * PREQ/PREP for HWMP) if the destination is a direct neighbor. Note that + * this might not be the optimal decision as a multi-hop route might be + * better. So if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4269,6 +4275,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_AWAKE_WINDOW, NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, + NL80211_MESHCONF_NOLEARN, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b4a74064675e..9af56b848544 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2126,6 +2126,8 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; + if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) + conf->dot11MeshNolearn = nconf->dot11MeshNolearn; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index d7e955127d5c..09eab2c3f380 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -638,6 +638,7 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); +IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +763,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); + MESHPARAMS_ADD(dot11MeshNolearn); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index bae3a3e15b88..bec23d2eee7a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1172,6 +1172,40 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, return -ENOENT; } +/** + * mesh_nexthop_lookup_nolearn - try to set next hop without path discovery + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Check if the meshDA (addr3) of a unicast frame is a direct neighbor. + * And if so, set the RA (addr1) to it to transmit to this node directly, + * avoiding PREQ/PREP path discovery. + * + * Returns: 0 if the next hop was found and -ENOENT otherwise. + */ +static int mesh_nexthop_lookup_nolearn(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct sta_info *sta; + + if (is_multicast_ether_addr(hdr->addr1)) + return -ENOENT; + + rcu_read_lock(); + sta = sta_info_get(sdata, hdr->addr3); + + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + memcpy(hdr->addr1, hdr->addr3, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + return 0; +} + /** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling * this function is considered "using" the associated mpath, so preempt a path @@ -1185,11 +1219,16 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; + if (ifmsh->mshcfg.dot11MeshNolearn && + !mesh_nexthop_lookup_nolearn(sdata, skb)) + return 0; + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) return -ENOENT; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index eac5aa1419fc..e4e363138279 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -78,6 +78,7 @@ const struct mesh_config default_mesh_config = { .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, + .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6fdf818f66cf..257c06315464 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6885,7 +6885,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, cur_params.plink_timeout) || nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, - cur_params.dot11MeshConnectedToMeshGate)) + cur_params.dot11MeshConnectedToMeshGate) || + nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, + cur_params.dot11MeshNolearn)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6943,6 +6945,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7094,6 +7097,8 @@ do { \ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask, + NL80211_MESHCONF_NOLEARN, nla_get_u8); if (mask_out) *mask_out = mask; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b23cab016521..6e218a0acd4e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -68,7 +68,8 @@ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ - __field(u16, dot11MeshHWMPconfirmationInterval) + __field(u16, dot11MeshHWMPconfirmationInterval) \ + __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ @@ -109,6 +110,7 @@ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ + __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ -- cgit v1.2.3-59-g8ed1b From 184eebe664f0e11c485f6d309fe56297b3f75e9e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:37 +0200 Subject: cfg80211/mac80211: add connected to auth server to meshconf Besides information about num of peerings and gate connectivity, the mesh formation byte also contains a flag for authentication server connectivity, that currently cannot be set in the mesh conf. This patch adds this capability, which is necessary to implement 802.1X authentication in mesh mode. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-1-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + include/uapi/linux/nl80211.h | 5 +++++ net/mac80211/cfg.c | 3 +++ net/mac80211/debugfs_netdev.c | 3 +++ net/mac80211/mesh.c | 5 ++++- net/wireless/nl80211.c | 8 +++++++- 6 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 78b220950942..8d5071f84ffe 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1895,6 +1895,7 @@ struct mesh_config { u16 dot11MeshHWMPnetDiameterTraversalTime; u8 dot11MeshHWMPRootMode; bool dot11MeshConnectedToMeshGate; + bool dot11MeshConnectedToAuthServer; u16 dot11MeshHWMPRannInterval; bool dot11MeshGateAnnouncementProtocol; bool dot11MeshForwarding; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a83d8faf88ac..f1770e3756f4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4242,6 +4242,10 @@ enum nl80211_mesh_power_mode { * better. So if using this setting you will likely also want to disable * dot11MeshForwarding and use another mesh routing protocol on top. * + * @NL80211_MESHCONF_CONNECTED_TO_AS: If set to true then this mesh STA + * will advertise that it is connected to a authentication server + * in the mesh formation field. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4276,6 +4280,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, NL80211_MESHCONF_NOLEARN, + NL80211_MESHCONF_CONNECTED_TO_AS, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9af56b848544..6a6531a50e54 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2128,6 +2128,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) + conf->dot11MeshConnectedToAuthServer = + nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 09eab2c3f380..fe8a7a87e513 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -639,6 +639,8 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, + u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -764,6 +766,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); MESHPARAMS_ADD(dot11MeshNolearn); + MESHPARAMS_ADD(dot11MeshConnectedToAuthServer); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 96f0323c0a3d..d0db6af16427 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -260,6 +260,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, bool is_connected_to_gate = ifmsh->num_gates > 0 || ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || ifmsh->mshcfg.dot11MeshConnectedToMeshGate; + bool is_connected_to_as = ifmsh->mshcfg.dot11MeshConnectedToAuthServer; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -284,7 +285,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = (neighbors << 1) | is_connected_to_gate; + *pos++ = (is_connected_to_as << 7) | + (neighbors << 1) | + is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 257c06315464..434fd06dc5cf 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6887,7 +6887,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, cur_params.dot11MeshConnectedToMeshGate) || nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, - cur_params.dot11MeshNolearn)) + cur_params.dot11MeshNolearn) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS, + cur_params.dot11MeshConnectedToAuthServer)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6946,6 +6948,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7058,6 +7061,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, NL80211_MESHCONF_CONNECTED_TO_GATE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask, + NL80211_MESHCONF_CONNECTED_TO_AS, + nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. -- cgit v1.2.3-59-g8ed1b From 1303a51c24100b3b1915d6f9072fe5ae5bb4c5f6 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:38 +0200 Subject: cfg80211/mac80211: add connected to auth server to station info This patch adds the necessary bits to later query the auth server flag for every peer from iw. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-2-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/mac80211/sta_info.c | 4 +++- net/mac80211/sta_info.h | 2 ++ net/wireless/nl80211.c | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8d5071f84ffe..39fe21edd2c5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1598,6 +1598,7 @@ struct cfg80211_tid_stats { * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. * @airtime_link_metric: mesh airtime link metric. + * @connected_to_as: true if mesh STA has a path to authentication server */ struct station_info { u64 filled; @@ -1655,6 +1656,8 @@ struct station_info { u32 fcs_err_count; u32 airtime_link_metric; + + u8 connected_to_as; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1770e3756f4..d6b6599a6001 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3370,6 +3370,8 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station * @NL80211_STA_INFO_ASSOC_AT_BOOTTIME: Timestamp (CLOCK_BOOTTIME, nanoseconds) * of STA's association + * @NL80211_STA_INFO_CONNECTED_TO_AS: set to true if STA has a path to a + * authentication server (u8, 0 or 1) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3417,6 +3419,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_AIRTIME_WEIGHT, NL80211_STA_INFO_AIRTIME_LINK_METRIC, NL80211_STA_INFO_ASSOC_AT_BOOTTIME, + NL80211_STA_INFO_CONNECTED_TO_AS, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cd8487bc6fc2..a39773b40457 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2426,7 +2426,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2439,6 +2440,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 49728047dfad..9d398c9daa4c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -385,6 +385,7 @@ DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate + * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ @@ -404,6 +405,7 @@ struct mesh_sta { bool processed_beacon; bool connected_to_gate; + bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 434fd06dc5cf..13a38aab1565 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5395,6 +5395,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); + PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, -- cgit v1.2.3-59-g8ed1b From fd17dba1c860d39f655a3a08387c21e3ceca8c55 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 20 Jul 2020 13:12:25 +0530 Subject: cfg80211: Add support to advertize OCV support Add a new feature flag that drivers can use to advertize support for Operating Channel Validation (OCV) when using driver's SME for RSNA handshakes. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200720074225.8990-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d6b6599a6001..a3ae2b060a55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5804,6 +5804,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver * can report tx status for control port over nl80211 tx operations. * + * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating + * Channel Validation (OCV) when using driver's SME for RSNA handshakes. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5859,6 +5862,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, + NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3-59-g8ed1b From f96622749a67d40ad5efe8a58d5fc95313097aa0 Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Tue, 23 Jun 2020 08:49:35 -0500 Subject: nl80211: support 4-way handshake offloading for WPA/WPA2-PSK in AP mode Let drivers advertise support for AP-mode WPA/WPA2-PSK 4-way handshake offloading with a new NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag. Extend use of NL80211_ATTR_PMK attribute indicating it might be passed as part of NL80211_CMD_START_AP command, and contain the PSK (which is the PMK, hence the name). The driver is assumed to handle the 4-way handshake by itself in this case, instead of relying on userspace. Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200623134938.39997-2-chi-hsien.lin@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 41 ++++++++++++++++++++++++++++------------- net/wireless/nl80211.c | 4 +++- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a3ae2b060a55..631f3a997b3c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -183,18 +183,27 @@ * * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag drivers * can indicate they support offloading EAPOL handshakes for WPA/WPA2 - * preshared key authentication. In %NL80211_CMD_CONNECT the preshared - * key should be specified using %NL80211_ATTR_PMK. Drivers supporting - * this offload may reject the %NL80211_CMD_CONNECT when no preshared - * key material is provided, for example when that driver does not - * support setting the temporal keys through %CMD_NEW_KEY. + * preshared key authentication in station mode. In %NL80211_CMD_CONNECT + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_CONNECT when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * Similarly @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X flag can be * set by drivers indicating offload support of the PTK/GTK EAPOL - * handshakes during 802.1X authentication. In order to use the offload - * the %NL80211_CMD_CONNECT should have %NL80211_ATTR_WANT_1X_4WAY_HS - * attribute flag. Drivers supporting this offload may reject the - * %NL80211_CMD_CONNECT when the attribute flag is not present. + * handshakes during 802.1X authentication in station mode. In order to + * use the offload the %NL80211_CMD_CONNECT should have + * %NL80211_ATTR_WANT_1X_4WAY_HS attribute flag. Drivers supporting this + * offload may reject the %NL80211_CMD_CONNECT when the attribute flag is + * not present. + * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication in AP mode. In %NL80211_CMD_START_AP + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_START_AP when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * For 802.1X the PMK or PMK-R0 are set by providing %NL80211_ATTR_PMK * using %NL80211_CMD_SET_PMK. For offloaded FT support also @@ -2362,10 +2371,11 @@ enum nl80211_commands { * * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. - * For %NL80211_CMD_CONNECT it is used to provide PSK for offloading 4-way - * handshake for WPA/WPA2-PSK networks. For 802.1X authentication it is - * used with %NL80211_CMD_SET_PMK. For offloaded FT support this attribute - * specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME is included as well. + * For %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP it is used to provide + * PSK for offloading 4-way handshake for WPA/WPA2-PSK networks. For 802.1X + * authentication it is used with %NL80211_CMD_SET_PMK. For offloaded FT + * support this attribute specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME + * is included as well. * * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to * indicate that it supports multiple active scheduled scan requests. @@ -5807,6 +5817,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating * Channel Validation (OCV) when using driver's SME for RSNA handshakes. * + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK: Device wants to do 4-way + * handshake with PSK in AP mode (PSK is passed as part of the start AP + * command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5863,6 +5877,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8d78a6fc59a3..a096682ec0ad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9442,7 +9442,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) return -EINVAL; if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK)) return -EINVAL; settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } -- cgit v1.2.3-59-g8ed1b From 48040793fa6003d211f021c6ad273477bcd90d91 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 30 Jul 2020 15:44:40 -0700 Subject: tcp: add earliest departure time to SCM_TIMESTAMPING_OPT_STATS This change adds TCP_NLA_EDT to SCM_TIMESTAMPING_OPT_STATS that reports the earliest departure time(EDT) of the timestamped skb. By tracking EDT values of the skb from different timestamps, we can observe when and how much the value changed. This allows to measure the precise delay injected on the sender host e.g. by a bpf-base throttler. Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/uapi/linux/tcp.h | 1 + net/core/skbuff.c | 2 +- net/ipv4/tcp.c | 6 +++++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 527d668a5275..14b62d7df942 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -484,7 +484,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index f2acb2566333..cfcb10b75483 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -313,6 +313,7 @@ enum { TCP_NLA_SRTT, /* smoothed RTT in usecs */ TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ + TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8afefe6f6b6..4e2edfbe0e19 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4692,7 +4692,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb); opt_stats = true; } else #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4afec552f211..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3501,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3558,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } -- cgit v1.2.3-59-g8ed1b From 829eb208e80d6db95c0201cb8fa00c2f9ad87faf Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 31 Jul 2020 17:34:01 -0700 Subject: rtnetlink: add support for protodown reason netdev protodown is a mechanism that allows protocols to hold an interface down. It was initially introduced in the kernel to hold links down by a multihoming protocol. There was also an attempt to introduce protodown reason at the time but was rejected. protodown and protodown reason is supported by almost every switching and routing platform. It was ok for a while to live without a protodown reason. But, its become more critical now given more than one protocol may need to keep a link down on a system at the same time. eg: vrrp peer node, port security, multihoming protocol. Its common for Network operators and protocol developers to look for such a reason on a networking box (Its also known as errDisable by most networking operators) This patch adds support for link protodown reason attribute. There are two ways to maintain protodown reasons. (a) enumerate every possible reason code in kernel - A protocol developer has to make a request and have that appear in a certain kernel version (b) provide the bits in the kernel, and allow user-space (sysadmin or NOS distributions) to manage the bit-to-reasonname map. - This makes extending reason codes easier (kind of like the iproute2 table to vrf-name map /etc/iproute2/rt_tables.d/) This patch takes approach (b). a few things about the patch: - It treats the protodown reason bits as counter to indicate active protodown users - Since protodown attribute is already an exposed UAPI, the reason is not enforced on a protodown set. Its a no-op if not used. the patch follows the below algorithm: - presence of reason bits set indicates protodown is in use - user can set protodown and protodown reason in a single or multiple setlink operations - setlink operation to clear protodown, will return -EBUSY if there are active protodown reason bits - reason is not included in link dumps if not used example with patched iproute2: $cat /etc/iproute2/protodown_reasons.d/r.conf 0 mlag 1 evpn 2 vrrp 3 psecurity $ip link set dev vxlan0 protodown on protodown_reason vrrp on $ip link set dev vxlan0 protodown_reason mlag on $ip link show 14: vxlan0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether f6:06:be:17:91:e7 brd ff:ff:ff:ff:ff:ff protodown on $ip link set dev vxlan0 protodown_reason mlag off $ip link set dev vxlan0 protodown off protodown_reason vrrp off Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++ include/uapi/linux/if_link.h | 10 ++++ net/core/dev.c | 25 ++++++++++ net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 147 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..ba0fa6b22787 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2058,6 +2058,8 @@ struct net_device { struct timer_list watchdog_timer; int watchdog_timeo; + u32 proto_down_reason; + struct list_head todo_list; int __percpu *pcpu_refcnt; @@ -3810,6 +3812,8 @@ int dev_get_port_parent_id(struct net_device *dev, bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 63af64646358..7fba4de511de 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -170,12 +170,22 @@ enum { IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, __IFLA_MAX }; #define IFLA_MAX (__IFLA_MAX - 1) +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + /* backwards compatibility for userspace */ #ifndef __KERNEL__ #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) diff --git a/net/core/dev.c b/net/core/dev.c index 38a6371d9bc5..f7ef0f5c5569 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8715,6 +8715,31 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) +{ + int b; + + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); + u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..a54c3e0f2ee1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1658,6 +1668,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1747,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1875,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2525,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2874,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.2.3-59-g8ed1b From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach bpf_link without destroying it. It has the same behavior as auto-detaching of bpf_link due to cgroup dying for bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case, bpf_link is still a valid kernel object, but is defuncts and doesn't hold BPF program attached to corresponding BPF hook. This functionality allows users with enough access rights to manually force-detach attached bpf_link without killing respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ net/core/dev.c | 11 ++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c5e206ecf2..cef4ef0d2b4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -793,6 +793,7 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/net/core/dev.c b/net/core/dev.c index a2a57988880a..c8b911b10187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link) /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ - if (xdp_link->dev) + if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } rtnl_unlock(); } +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); @@ -9066,6 +9074,7 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, -- cgit v1.2.3-59-g8ed1b From f8951902b9daa65ba240ce8a054c727748df2147 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:32:08 -0700 Subject: MTD: mtd-abi.h: drop a duplicated word Drop the repeated word "mode" in a comment. Signed-off-by: Randy Dunlap Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: linux-mtd@lists.infradead.org Signed-off-by: Richard Weinberger --- include/uapi/mtd/mtd-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h index 4b48fbf7d343..65b9db936557 100644 --- a/include/uapi/mtd/mtd-abi.h +++ b/include/uapi/mtd/mtd-abi.h @@ -262,7 +262,7 @@ struct mtd_ecc_stats { * @MTD_FILE_MODE_OTP_USER: OTP enabled in user mode * @MTD_FILE_MODE_RAW: OTP disabled, ECC disabled * - * These modes can be set via ioctl(MTDFILEMODE). The mode mode will be retained + * These modes can be set via ioctl(MTDFILEMODE). The mode will be retained * separately for each open file descriptor. * * Note: %MTD_FILE_MODE_RAW provides the same functionality as %MTD_OPS_RAW - -- cgit v1.2.3-59-g8ed1b From e5a52fd2b8cdb700b3c07b030e050a49ef3156b9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:33:17 -0700 Subject: xen/gntdev: gntdev.h: drop a duplicated word Drop the repeated word "of" in a comment. Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20200719003317.21454-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- include/uapi/xen/gntdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index fe4423e518c6..9ac5515b9bc2 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { /* * Removes the grant references from the mapping table of an instance of - * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * gntdev. N.B. munmap() must be called on the relevant virtual address(es) * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -- cgit v1.2.3-59-g8ed1b From 321bd212619a7269308696e4ddc446930ea73fad Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Jun 2020 18:24:33 -0400 Subject: virtio: VIRTIO_F_IOMMU_PLATFORM -> VIRTIO_F_ACCESS_PLATFORM Rename the bit to match latest virtio spec. Add a compat macro to avoid breaking existing userspace. Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand --- arch/um/drivers/virtio_uml.c | 2 +- drivers/vdpa/ifcvf/ifcvf_base.h | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++-- drivers/vhost/net.c | 4 ++-- drivers/vhost/vdpa.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_ring.c | 2 +- include/linux/virtio_config.h | 2 +- include/uapi/linux/virtio_config.h | 10 +++++++--- tools/virtio/linux/virtio_config.h | 2 +- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 351aee52aca6..a6c4bb6c2c01 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -385,7 +385,7 @@ static irqreturn_t vu_req_interrupt(int irq, void *data) } break; case VHOST_USER_SLAVE_IOTLB_MSG: - /* not supported - VIRTIO_F_IOMMU_PLATFORM */ + /* not supported - VIRTIO_F_ACCESS_PLATFORM */ case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: /* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */ default: diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index f4554412e607..24af422b5a3e 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -29,7 +29,7 @@ (1ULL << VIRTIO_F_VERSION_1) | \ (1ULL << VIRTIO_NET_F_STATUS) | \ (1ULL << VIRTIO_F_ORDER_PLATFORM) | \ - (1ULL << VIRTIO_F_IOMMU_PLATFORM) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM) | \ (1ULL << VIRTIO_NET_F_MRG_RXBUF)) /* Only one queue pair for now. */ diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c7334cc65bb2..a9bc5e0fb353 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -55,7 +55,7 @@ struct vdpasim_virtqueue { static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM); + (1ULL << VIRTIO_F_ACCESS_PLATFORM); /* State of each vdpasim device */ struct vdpasim { @@ -450,7 +450,7 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); /* DMA mapping must be done by driver */ - if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) return -EINVAL; vdpasim->features = features & vdpasim_features; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e992decfec53..8e0921d3805d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -73,7 +73,7 @@ enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM) + (1ULL << VIRTIO_F_ACCESS_PLATFORM) }; enum { @@ -1653,7 +1653,7 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) !vhost_log_access_ok(&n->dev)) goto out_unlock; - if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { + if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev, true)) goto out_unlock; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a54b60d6623f..18869a35d408 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -31,7 +31,7 @@ enum { (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM) | + (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_PACKED) | (1ULL << VIRTIO_F_ORDER_PLATFORM) | (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8be02f333b7a..54fd989f9353 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1129,7 +1129,7 @@ static int virtballoon_validate(struct virtio_device *vdev) else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); - __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); + __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 58b96baa8d48..a1a5c2a91426 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2225,7 +2225,7 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_VERSION_1: break; - case VIRTIO_F_IOMMU_PLATFORM: + case VIRTIO_F_ACCESS_PLATFORM: break; case VIRTIO_F_RING_PACKED: break; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index bb4cc4910750..f2cc2a0df174 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -171,7 +171,7 @@ static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) * Note the reverse polarity of the quirk feature (compared to most * other features), this is for compatibility with legacy systems. */ - return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } static inline diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index ff8e7dc9d4dd..b5eda06f0d57 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -67,13 +67,17 @@ #define VIRTIO_F_VERSION_1 32 /* - * If clear - device has the IOMMU bypass quirk feature. - * If set - use platform tools to detect the IOMMU. + * If clear - device has the platform DMA (e.g. IOMMU) bypass quirk feature. + * If set - use platform DMA tools to access the memory. * * Note the reverse polarity (compared to most other features), * this is for compatibility with legacy systems. */ -#define VIRTIO_F_IOMMU_PLATFORM 33 +#define VIRTIO_F_ACCESS_PLATFORM 33 +#ifndef __KERNEL__ +/* Legacy name for VIRTIO_F_ACCESS_PLATFORM (for compatibility with old userspace) */ +#define VIRTIO_F_IOMMU_PLATFORM VIRTIO_F_ACCESS_PLATFORM +#endif /* __KERNEL__ */ /* This feature indicates support for the packed virtqueue layout. */ #define VIRTIO_F_RING_PACKED 34 diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h index dbf14c1e2188..f99ae42668e0 100644 --- a/tools/virtio/linux/virtio_config.h +++ b/tools/virtio/linux/virtio_config.h @@ -51,7 +51,7 @@ static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) * Note the reverse polarity of the quirk feature (compared to most * other features), this is for compatibility with legacy systems. */ - return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) -- cgit v1.2.3-59-g8ed1b From 9d2f627b7ec9d5d3246b6cec17f290ee6778c83b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:20:56 +0200 Subject: net: openvswitch: add masks cache hit counter Add a counter that counts the number of masks cache hits, and export it through the megaflow netlink statistics. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/datapath.c | 5 ++++- net/openvswitch/datapath.h | 3 +++ net/openvswitch/flow_table.c | 19 ++++++++++++++----- net/openvswitch/flow_table.h | 3 ++- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9b14519e74d9..7cb76e5ca7cf 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -102,8 +102,8 @@ struct ovs_dp_megaflow_stats { __u64 n_mask_hit; /* Number of masks used for flow lookups. */ __u32 n_masks; /* Number of masks for the datapath. */ __u32 pad0; /* Pad for future expension. */ + __u64 n_cache_hit; /* Number of cache matches for flow lookups. */ __u64 pad1; /* Pad for future expension. */ - __u64 pad2; /* Pad for future expension. */ }; struct ovs_vport_stats { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6b6822f82f70..f45fee760504 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -225,13 +225,14 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -262,6 +263,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -699,6 +701,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 24fcec22fde2..38f7d3e66ca6 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -38,12 +38,15 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index af22c9ee28dd..a5912ea05352 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -667,6 +667,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); @@ -682,6 +683,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, u64_stats_update_begin(&ma->syncp); usage_counters[*index]++; u64_stats_update_end(&ma->syncp); + (*n_cache_hit)++; return flow; } } @@ -719,7 +721,8 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); @@ -729,10 +732,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; + *n_cache_hit = 0; if (unlikely(!skb_hash)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -753,7 +759,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -766,10 +772,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. */ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -779,9 +787,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 1f664b050e3b..325e939371d8 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -82,7 +82,8 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, -- cgit v1.2.3-59-g8ed1b From 9bf24f594c6acf676fb8c229f152c21bfb915ddb Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:21:34 +0200 Subject: net: openvswitch: make masks cache size configurable This patch makes the masks cache size configurable, or with a size of 0, disable it. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 17 +++++++ net/openvswitch/flow_table.c | 101 ++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_table.h | 10 +++- 4 files changed, 115 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7cb76e5ca7cf..8300cc29dec8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -86,6 +86,7 @@ enum ovs_datapath_attr { OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, + OVS_DP_ATTR_MASKS_CACHE_SIZE, __OVS_DP_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f45fee760504..42f8cc70bb2c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1498,6 +1498,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ return msgsize; } @@ -1535,6 +1536,10 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + genlmsg_end(skb, ovs_header); return 0; @@ -1599,6 +1604,16 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) @@ -1887,6 +1902,8 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), }; static const struct genl_ops dp_datapath_genl_ops[] = { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index a5912ea05352..6527d84c3ea6 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -38,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -341,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -367,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -377,7 +442,7 @@ free_ti: free_mask_array: __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } @@ -453,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); - free_percpu(table->mask_cache); - call_rcu(&table->mask_array->rcu, mask_array_rcu_cb); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(table, ti, ufid_ti, false); } @@ -724,6 +791,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, u32 *n_mask_hit, u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -733,7 +801,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, *n_mask_hit = 0; *n_cache_hit = 0; - if (unlikely(!skb_hash)) { + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; u32 cache = 0; @@ -749,11 +817,11 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; @@ -867,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -1095,8 +1170,8 @@ void ovs_flow_masks_rebalance(struct flow_table *table) for (i = 0; i < masks_entries; i++) { int index = masks_and_count[i].index; - new->masks[new->count++] = - rcu_dereference_ovsl(ma->masks[index]); + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; } rcu_assign_pointer(table->mask_array, new); diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 325e939371d8..74ce48fecba9 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,6 +27,12 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + struct mask_count { int index; u64 counter; @@ -53,7 +59,7 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -77,6 +83,8 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, -- cgit v1.2.3-59-g8ed1b From 88fab21c691bb1ff164e540735237a385e3afeaf Mon Sep 17 00:00:00 2001 From: Ioana-Ruxandra Stăncioi Date: Mon, 3 Aug 2020 07:33:33 +0000 Subject: seg6_iptunnel: Refactor seg6_lwt_headroom out of uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the function seg6_lwt_headroom out of the seg6_iptunnel.h uapi header, because it is only used in seg6_iptunnel.c. Moreover, it is only used in the kernel code, as indicated by the "#ifdef __KERNEL__". Suggested-by: David Miller Signed-off-by: Ioana-Ruxandra Stăncioi Signed-off-by: David S. Miller --- include/uapi/linux/seg6_iptunnel.h | 21 --------------------- net/ipv6/seg6_iptunnel.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 09fb608a35ec..eb815e0d0ac3 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -37,25 +37,4 @@ enum { SEG6_IPTUN_MODE_L2ENCAP, }; -#ifdef __KERNEL__ - -static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) -{ - int head = 0; - - switch (tuninfo->mode) { - case SEG6_IPTUN_MODE_INLINE: - break; - case SEG6_IPTUN_MODE_ENCAP: - head = sizeof(struct ipv6hdr); - break; - case SEG6_IPTUN_MODE_L2ENCAP: - return 0; - } - - return ((tuninfo->srh->hdrlen + 1) << 3) + head; -} - -#endif - #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e0e9f48ab14f..897fa59c47de 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -27,6 +27,23 @@ #include #endif +static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; +} + struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; -- cgit v1.2.3-59-g8ed1b From 4476770881d7ac647e3bcae0943f37e00b9c3f3c Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Wed, 29 Jul 2020 10:40:00 -0700 Subject: remoteproc: Add remoteproc character device interface Add the character device interface into remoteproc framework. This interface can be used in order to boot/shutdown remote subsystems and provides a basic ioctl based interface to implement supplementary functionality. An ioctl call is implemented to enable the shutdown on release feature which will allow remote processors to be shutdown when the controlling userspace application crashes or hangs. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Rishabh Bhatnagar Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1596044401-22083-2-git-send-email-sidgup@codeaurora.org [bjorn: s/int32_t/s32/ per checkpatch] Signed-off-by: Bjorn Andersson --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + drivers/remoteproc/Kconfig | 9 ++ drivers/remoteproc/Makefile | 1 + drivers/remoteproc/remoteproc_cdev.c | 124 +++++++++++++++++++++ drivers/remoteproc/remoteproc_internal.h | 28 +++++ include/linux/remoteproc.h | 5 + include/uapi/linux/remoteproc_cdev.h | 37 ++++++ 7 files changed, 205 insertions(+) create mode 100644 drivers/remoteproc/remoteproc_cdev.c create mode 100644 include/uapi/linux/remoteproc_cdev.h (limited to 'include/uapi') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 59472cd6a11d..2a198838fca9 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -339,6 +339,7 @@ Code Seq# Include File Comments 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h 0xB6 all linux/fpga-dfl.h +0xB7 all uapi/linux/remoteproc_cdev.h 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h 0xCA 10-2F uapi/misc/ocxl.h diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 48315dc4a30c..c6659dfea7c7 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -14,6 +14,15 @@ config REMOTEPROC if REMOTEPROC +config REMOTEPROC_CDEV + bool "Remoteproc character device interface" + help + Say y here to have a character device interface for the remoteproc + framework. Userspace can boot/shutdown remote processors through + this interface. + + It's safe to say N if you don't want to use this interface. + config IMX_REMOTEPROC tristate "IMX6/7 remoteproc support" depends on ARCH_MXC diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile index 4d4307dc8fa9..3dfa28e6c701 100644 --- a/drivers/remoteproc/Makefile +++ b/drivers/remoteproc/Makefile @@ -10,6 +10,7 @@ remoteproc-y += remoteproc_debugfs.o remoteproc-y += remoteproc_sysfs.o remoteproc-y += remoteproc_virtio.o remoteproc-y += remoteproc_elf_loader.o +obj-$(CONFIG_REMOTEPROC_CDEV) += remoteproc_cdev.o obj-$(CONFIG_IMX_REMOTEPROC) += imx_rproc.o obj-$(CONFIG_INGENIC_VPU_RPROC) += ingenic_rproc.o obj-$(CONFIG_MTK_SCP) += mtk_scp.o mtk_scp_ipi.o diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c new file mode 100644 index 000000000000..b19ea3057bde --- /dev/null +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Character device interface driver for Remoteproc framework. + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "remoteproc_internal.h" + +#define NUM_RPROC_DEVICES 64 +static dev_t rproc_major; + +static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) +{ + struct rproc *rproc = container_of(filp->f_inode->i_cdev, struct rproc, cdev); + int ret = 0; + char cmd[10]; + + if (!len || len > sizeof(cmd)) + return -EINVAL; + + ret = copy_from_user(cmd, buf, len); + if (ret) + return -EFAULT; + + if (!strncmp(cmd, "start", len)) { + if (rproc->state == RPROC_RUNNING) + return -EBUSY; + + ret = rproc_boot(rproc); + } else if (!strncmp(cmd, "stop", len)) { + if (rproc->state != RPROC_RUNNING) + return -EINVAL; + + rproc_shutdown(rproc); + } else { + dev_err(&rproc->dev, "Unrecognized option\n"); + ret = -EINVAL; + } + + return ret ? ret : len; +} + +static long rproc_device_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct rproc *rproc = container_of(filp->f_inode->i_cdev, struct rproc, cdev); + void __user *argp = (void __user *)arg; + s32 param; + + switch (ioctl) { + case RPROC_SET_SHUTDOWN_ON_RELEASE: + if (copy_from_user(¶m, argp, sizeof(s32))) + return -EFAULT; + + rproc->cdev_put_on_release = !!param; + break; + case RPROC_GET_SHUTDOWN_ON_RELEASE: + param = (s32)rproc->cdev_put_on_release; + if (copy_to_user(argp, ¶m, sizeof(s32))) + return -EFAULT; + + break; + default: + dev_err(&rproc->dev, "Unsupported ioctl\n"); + return -EINVAL; + } + + return 0; +} + +static int rproc_cdev_release(struct inode *inode, struct file *filp) +{ + struct rproc *rproc = container_of(inode->i_cdev, struct rproc, cdev); + + if (rproc->cdev_put_on_release && rproc->state == RPROC_RUNNING) + rproc_shutdown(rproc); + + return 0; +} + +static const struct file_operations rproc_fops = { + .write = rproc_cdev_write, + .unlocked_ioctl = rproc_device_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = rproc_cdev_release, +}; + +int rproc_char_device_add(struct rproc *rproc) +{ + int ret; + + cdev_init(&rproc->cdev, &rproc_fops); + rproc->cdev.owner = THIS_MODULE; + + rproc->dev.devt = MKDEV(MAJOR(rproc_major), rproc->index); + cdev_set_parent(&rproc->cdev, &rproc->dev.kobj); + ret = cdev_add(&rproc->cdev, rproc->dev.devt, 1); + if (ret < 0) + dev_err(&rproc->dev, "Failed to add char dev for %s\n", rproc->name); + + return ret; +} + +void rproc_char_device_remove(struct rproc *rproc) +{ + __unregister_chrdev(MAJOR(rproc->dev.devt), rproc->index, 1, "remoteproc"); +} + +void __init rproc_init_cdev(void) +{ + int ret; + + ret = alloc_chrdev_region(&rproc_major, 0, NUM_RPROC_DEVICES, "remoteproc"); + if (ret < 0) + pr_err("Failed to alloc rproc_cdev region, err %d\n", ret); +} diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index cd4176b033a3..c34002888d2c 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -53,6 +53,34 @@ void rproc_exit_sysfs(void); void rproc_coredump_cleanup(struct rproc *rproc); void rproc_coredump(struct rproc *rproc); +#ifdef CONFIG_REMOTEPROC_CDEV +void rproc_init_cdev(void); +void rproc_exit_cdev(void); +int rproc_char_device_add(struct rproc *rproc); +void rproc_char_device_remove(struct rproc *rproc); +#else +static inline void rproc_init_cdev(void) +{ +} + +static inline void rproc_exit_cdev(void) +{ +} + +/* + * The character device interface is an optional feature, if it is not enabled + * the function should not return an error. + */ +static inline int rproc_char_device_add(struct rproc *rproc) +{ + return 0; +} + +static inline void rproc_char_device_remove(struct rproc *rproc) +{ +} +#endif + void rproc_free_vring(struct rproc_vring *rvring); int rproc_alloc_vring(struct rproc_vdev *rvdev, int i); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0e8d2ff575b4..2fa68bf5aa4f 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -509,6 +510,8 @@ struct rproc_dump_segment { * @autonomous: true if an external entity has booted the remote processor * @dump_segments: list of segments in the firmware * @nb_vdev: number of vdev currently handled by rproc + * @char_dev: character device of the rproc + * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release */ struct rproc { struct list_head node; @@ -546,6 +549,8 @@ struct rproc { int nb_vdev; u8 elf_class; u16 elf_machine; + struct cdev cdev; + bool cdev_put_on_release; }; /** diff --git a/include/uapi/linux/remoteproc_cdev.h b/include/uapi/linux/remoteproc_cdev.h new file mode 100644 index 000000000000..c43768e4b0dc --- /dev/null +++ b/include/uapi/linux/remoteproc_cdev.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * IOCTLs for Remoteproc's character device interface. + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + */ + +#ifndef _UAPI_REMOTEPROC_CDEV_H_ +#define _UAPI_REMOTEPROC_CDEV_H_ + +#include +#include + +#define RPROC_MAGIC 0xB7 + +/* + * The RPROC_SET_SHUTDOWN_ON_RELEASE ioctl allows to enable/disable the shutdown of a remote + * processor automatically when the controlling userpsace closes the char device interface. + * + * input parameter: integer + * 0 : disable automatic shutdown + * other : enable automatic shutdown + */ +#define RPROC_SET_SHUTDOWN_ON_RELEASE _IOW(RPROC_MAGIC, 1, __s32) + +/* + * The RPROC_GET_SHUTDOWN_ON_RELEASE ioctl gets information about whether the automatic shutdown of + * a remote processor is enabled or disabled when the controlling userspace closes the char device + * interface. + * + * output parameter: integer + * 0 : automatic shutdown disable + * other : automatic shutdown enable + */ +#define RPROC_GET_SHUTDOWN_ON_RELEASE _IOR(RPROC_MAGIC, 2, __s32) + +#endif -- cgit v1.2.3-59-g8ed1b From cae19a6386c86dec8ec2810a96d7497a2eec8d38 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_9p: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_9p.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_9p.h b/include/uapi/linux/virtio_9p.h index 277c4ad44e84..441047432258 100644 --- a/include/uapi/linux/virtio_9p.h +++ b/include/uapi/linux/virtio_9p.h @@ -25,7 +25,7 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include +#include #include #include @@ -36,7 +36,7 @@ struct virtio_9p_config { /* length of the tag name */ - __u16 tag_len; + __virtio16 tag_len; /* non-NULL terminated tag name */ __u8 tag[0]; } __attribute__((packed)); -- cgit v1.2.3-59-g8ed1b From c73cb10cc4421baa669c1edd8211086280273216 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_balloon: correct tags for config space fields Tag config space fields as having little endian-ness. Note that balloon is special: LE even when using the legacy interface. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_balloon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index dc3e656470dd..ddaa45e723c4 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -45,20 +45,20 @@ #define VIRTIO_BALLOON_CMD_ID_DONE 1 struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ - __u32 num_pages; + __le32 num_pages; /* Number of pages we've actually got in balloon. */ - __u32 actual; + __le32 actual; /* * Free page hint command id, readonly by guest. * Was previously named free_page_report_cmd_id so we * need to carry that name for legacy support. */ union { - __u32 free_page_hint_cmd_id; - __u32 free_page_report_cmd_id; /* deprecated */ + __le32 free_page_hint_cmd_id; + __le32 free_page_report_cmd_id; /* deprecated */ }; /* Stores PAGE_POISON if page poisoning is in use */ - __u32 poison_val; + __le32 poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ -- cgit v1.2.3-59-g8ed1b From 40e04c488bd6ab1859778d40bf9bb3ca294ab97b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_blk: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Reviewed-by: Stefano Garzarella Reviewed-by: Stefano Garzarella --- include/uapi/linux/virtio_blk.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 0f99d7b49ede..d888f013d9ff 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -57,20 +57,20 @@ struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ - __u64 capacity; + __virtio64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ - __u32 size_max; + __virtio32 size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ - __u32 seg_max; + __virtio32 seg_max; /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { - __u16 cylinders; + __virtio16 cylinders; __u8 heads; __u8 sectors; } geometry; /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ - __u32 blk_size; + __virtio32 blk_size; /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */ /* exponent for physical block per logical block. */ @@ -78,42 +78,42 @@ struct virtio_blk_config { /* alignment offset in logical blocks. */ __u8 alignment_offset; /* minimum I/O size without performance penalty in logical blocks. */ - __u16 min_io_size; + __virtio16 min_io_size; /* optimal sustained I/O size in logical blocks. */ - __u32 opt_io_size; + __virtio32 opt_io_size; /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ __u8 wce; __u8 unused; /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ - __u16 num_queues; + __virtio16 num_queues; /* the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD */ /* * The maximum discard sectors (in 512-byte sectors) for * one segment. */ - __u32 max_discard_sectors; + __virtio32 max_discard_sectors; /* * The maximum number of discard segments in a * discard command. */ - __u32 max_discard_seg; + __virtio32 max_discard_seg; /* Discard commands must be aligned to this number of sectors. */ - __u32 discard_sector_alignment; + __virtio32 discard_sector_alignment; /* the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES */ /* * The maximum number of write zeroes sectors (in 512-byte sectors) in * one segment. */ - __u32 max_write_zeroes_sectors; + __virtio32 max_write_zeroes_sectors; /* * The maximum number of segments in a write zeroes * command. */ - __u32 max_write_zeroes_seg; + __virtio32 max_write_zeroes_seg; /* * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the * deallocation of one or more of the sectors. -- cgit v1.2.3-59-g8ed1b From dbe2dc8c5838ef415620a4fe691570b062ab046f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_console: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_console.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_console.h b/include/uapi/linux/virtio_console.h index b7fb108c9310..7e6ec2ff0560 100644 --- a/include/uapi/linux/virtio_console.h +++ b/include/uapi/linux/virtio_console.h @@ -45,13 +45,13 @@ struct virtio_console_config { /* colums of the screens */ - __u16 cols; + __virtio16 cols; /* rows of the screens */ - __u16 rows; + __virtio16 rows; /* max. number of ports this device can hold */ - __u32 max_nr_ports; + __virtio32 max_nr_ports; /* emergency write register */ - __u32 emerg_wr; + __virtio32 emerg_wr; } __attribute__((packed)); /* -- cgit v1.2.3-59-g8ed1b From 24bcf35b695ef5228f3035eb979cc2de571d560b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_crypto: correct tags for config space fields Since crypto is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_crypto.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h index 50cdc8aebfcf..a03932f10565 100644 --- a/include/uapi/linux/virtio_crypto.h +++ b/include/uapi/linux/virtio_crypto.h @@ -414,33 +414,33 @@ struct virtio_crypto_op_data_req { struct virtio_crypto_config { /* See VIRTIO_CRYPTO_OP_* above */ - __u32 status; + __le32 status; /* * Maximum number of data queue */ - __u32 max_dataqueues; + __le32 max_dataqueues; /* * Specifies the services mask which the device support, * see VIRTIO_CRYPTO_SERVICE_* above */ - __u32 crypto_services; + __le32 crypto_services; /* Detailed algorithms mask */ - __u32 cipher_algo_l; - __u32 cipher_algo_h; - __u32 hash_algo; - __u32 mac_algo_l; - __u32 mac_algo_h; - __u32 aead_algo; + __le32 cipher_algo_l; + __le32 cipher_algo_h; + __le32 hash_algo; + __le32 mac_algo_l; + __le32 mac_algo_h; + __le32 aead_algo; /* Maximum length of cipher key */ - __u32 max_cipher_key_len; + __le32 max_cipher_key_len; /* Maximum length of authenticated key */ - __u32 max_auth_key_len; - __u32 reserve; + __le32 max_auth_key_len; + __le32 reserve; /* Maximum size of each crypto request's content */ - __u64 max_size; + __le64 max_size; }; struct virtio_crypto_inhdr { -- cgit v1.2.3-59-g8ed1b From fc4a1accbb4ef372bb55b7ab161cf88e3b631935 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_fs: correct tags for config space fields Since fs is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Acked-by: Vivek Goyal Acked-by: Vivek Goyal Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h index b02eb2ac3d99..3056b6e9f8ce 100644 --- a/include/uapi/linux/virtio_fs.h +++ b/include/uapi/linux/virtio_fs.h @@ -13,7 +13,7 @@ struct virtio_fs_config { __u8 tag[36]; /* Number of request queues */ - __u32 num_request_queues; + __le32 num_request_queues; } __attribute__((packed)); #endif /* _UAPI_LINUX_VIRTIO_FS_H */ -- cgit v1.2.3-59-g8ed1b From f378444b7c97e39358de5d50d01fb0e92f259073 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_gpu: correct tags for config space fields Since gpu is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_gpu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h index 0c85914d9369..ccbd174ef321 100644 --- a/include/uapi/linux/virtio_gpu.h +++ b/include/uapi/linux/virtio_gpu.h @@ -320,10 +320,10 @@ struct virtio_gpu_resp_edid { #define VIRTIO_GPU_EVENT_DISPLAY (1 << 0) struct virtio_gpu_config { - __u32 events_read; - __u32 events_clear; - __u32 num_scanouts; - __u32 num_capsets; + __le32 events_read; + __le32 events_clear; + __le32 num_scanouts; + __le32 num_capsets; }; /* simple formats for fbcon/X use */ -- cgit v1.2.3-59-g8ed1b From 924b59a6dfa85dc9eac4c7f2fe1857bba2cb2510 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_input: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Gerd Hoffmann Reviewed-by: Gerd Hoffmann Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_input.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_input.h b/include/uapi/linux/virtio_input.h index a7fe5c8fb135..52084b1fb965 100644 --- a/include/uapi/linux/virtio_input.h +++ b/include/uapi/linux/virtio_input.h @@ -40,18 +40,18 @@ enum virtio_input_config_select { }; struct virtio_input_absinfo { - __u32 min; - __u32 max; - __u32 fuzz; - __u32 flat; - __u32 res; + __le32 min; + __le32 max; + __le32 fuzz; + __le32 flat; + __le32 res; }; struct virtio_input_devids { - __u16 bustype; - __u16 vendor; - __u16 product; - __u16 version; + __le16 bustype; + __le16 vendor; + __le16 product; + __le16 version; }; struct virtio_input_config { -- cgit v1.2.3-59-g8ed1b From 0ebcffcc2731682777bab19b51a512d8f31e1bdd Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_iommu: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Jean-Philippe Brucker Reviewed-by: Jean-Philippe Brucker Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_iommu.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index 48e3c29223b5..237e36a280cb 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -18,24 +18,24 @@ #define VIRTIO_IOMMU_F_MMIO 5 struct virtio_iommu_range_64 { - __u64 start; - __u64 end; + __le64 start; + __le64 end; }; struct virtio_iommu_range_32 { - __u32 start; - __u32 end; + __le32 start; + __le32 end; }; struct virtio_iommu_config { /* Supported page sizes */ - __u64 page_size_mask; + __le64 page_size_mask; /* Supported IOVA range */ struct virtio_iommu_range_64 input_range; /* Max domain ID size */ struct virtio_iommu_range_32 domain_range; /* Probe buffer size */ - __u32 probe_size; + __le32 probe_size; }; /* Request types */ -- cgit v1.2.3-59-g8ed1b From 79268954424771185fb4ca304786dd561a272246 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_mem: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. TODO: check other uses of __virtioXX types in this header, should probably be __leXX. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_mem.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h index a9ffe041843c..70e01c687d5e 100644 --- a/include/uapi/linux/virtio_mem.h +++ b/include/uapi/linux/virtio_mem.h @@ -185,27 +185,27 @@ struct virtio_mem_resp { struct virtio_mem_config { /* Block size and alignment. Cannot change. */ - __u64 block_size; + __le64 block_size; /* Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. */ - __u16 node_id; + __le16 node_id; __u8 padding[6]; /* Start address of the memory region. Cannot change. */ - __u64 addr; + __le64 addr; /* Region size (maximum). Cannot change. */ - __u64 region_size; + __le64 region_size; /* * Currently usable region size. Can grow up to region_size. Can * shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config * update will be sent). */ - __u64 usable_region_size; + __le64 usable_region_size; /* * Currently used size. Changes due to plug/unplug requests, but no * config updates will be sent. */ - __u64 plugged_size; + __le64 plugged_size; /* Requested size. New plug requests cannot exceed it. Can change. */ - __u64 requested_size; + __le64 requested_size; }; #endif /* _LINUX_VIRTIO_MEM_H */ -- cgit v1.2.3-59-g8ed1b From 577e677a785357542311a645eeb1756cd83988be Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_net: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 19d23e5baa4e..27d996f29dd1 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -87,19 +87,19 @@ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[ETH_ALEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ - __u16 status; + __virtio16 status; /* Maximum number of each of transmit and receive queues; * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ. * Legal values are between 1 and 0x8000 */ - __u16 max_virtqueue_pairs; + __virtio16 max_virtqueue_pairs; /* Default maximum transmit unit advice */ - __u16 mtu; + __virtio16 mtu; /* * speed, in units of 1Mb. All values 0 to INT_MAX are legal. * Any other value stands for unknown. */ - __u32 speed; + __virtio32 speed; /* * 0x00 - half duplex * 0x01 - full duplex -- cgit v1.2.3-59-g8ed1b From a28feb855cc0ad452acd3dfe2b8f2841927da5f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_pmem: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_pmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index b022787ffb94..d676b3620383 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -15,8 +15,8 @@ #include struct virtio_pmem_config { - __u64 start; - __u64 size; + __le64 start; + __le64 size; }; #define VIRTIO_PMEM_REQ_TYPE_FLUSH 0 -- cgit v1.2.3-59-g8ed1b From 965b5350514b597dc6347b733127e180844aeb43 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_scsi: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/scsi/virtio_scsi.c | 4 ++-- include/uapi/linux/virtio_scsi.h | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 0e0910c5b942..c36aeb9a1330 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -746,14 +746,14 @@ static struct scsi_host_template virtscsi_host_template = { #define virtscsi_config_get(vdev, fld) \ ({ \ - typeof(((struct virtio_scsi_config *)0)->fld) __val; \ + __virtio_native_type(struct virtio_scsi_config, fld) __val; \ virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \ __val; \ }) #define virtscsi_config_set(vdev, fld, val) \ do { \ - typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \ + __virtio_native_type(struct virtio_scsi_config, fld) __val = (val); \ virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \ } while(0) diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h index cc18ef8825c0..0abaae4027c0 100644 --- a/include/uapi/linux/virtio_scsi.h +++ b/include/uapi/linux/virtio_scsi.h @@ -103,16 +103,16 @@ struct virtio_scsi_event { } __attribute__((packed)); struct virtio_scsi_config { - __u32 num_queues; - __u32 seg_max; - __u32 max_sectors; - __u32 cmd_per_lun; - __u32 event_info_size; - __u32 sense_size; - __u32 cdb_size; - __u16 max_channel; - __u16 max_target; - __u32 max_lun; + __virtio32 num_queues; + __virtio32 seg_max; + __virtio32 max_sectors; + __virtio32 cmd_per_lun; + __virtio32 event_info_size; + __virtio32 sense_size; + __virtio32 cdb_size; + __virtio16 max_channel; + __virtio16 max_target; + __virtio32 max_lun; } __attribute__((packed)); /* Feature Bits */ -- cgit v1.2.3-59-g8ed1b From 64ffa39dc860fb9772225c694353f73eca5801c6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Aug 2020 05:39:36 -0400 Subject: virtio_net: use LE accessors for speed/duplex Speed and duplex config fields depend on VIRTIO_NET_F_SPEED_DUPLEX which being 63>31 depends on VIRTIO_F_VERSION_1. Accordingly, use LE accessors for these fields. Reported-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 9 +++++---- include/uapi/linux/virtio_net.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba38765dc490..0934b1ec5320 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2264,12 +2264,13 @@ static void virtnet_update_settings(struct virtnet_info *vi) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) return; - speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config, - speed)); + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); + if (ethtool_validate_speed(speed)) vi->speed = speed; - duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config, - duplex)); + + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); + if (ethtool_validate_duplex(duplex)) vi->duplex = duplex; } diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 27d996f29dd1..3f55a4215f11 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -99,7 +99,7 @@ struct virtio_net_config { * speed, in units of 1Mb. All values 0 to INT_MAX are legal. * Any other value stands for unknown. */ - __virtio32 speed; + __le32 speed; /* * 0x00 - half duplex * 0x01 - full duplex -- cgit v1.2.3-59-g8ed1b From 25abc060d282132ea5c945392f900dca0a7e9bbb Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 4 Aug 2020 19:20:40 +0300 Subject: vhost-vdpa: support IOTLB batching hints This patches extend the vhost IOTLB API to accept batch updating hints form userspace. When userspace wants update the device IOTLB in a batch, it may do: 1) Write vhost_iotlb_msg with VHOST_IOTLB_BATCH_BEGIN flag 2) Perform a batch of IOTLB updating via VHOST_IOTLB_UPDATE/INVALIDATE 3) Write vhost_iotlb_msg with VHOST_IOTLB_BATCH_END flag Vhost-vdpa may decide to batch the IOMMU/IOTLB updating in step 3 when vDPA device support set_map() ops. This is useful for the vDPA device that want to know all the mappings to tweak their own DMA translation logic. For vDPA device that doesn't require set_map(), no behavior changes. This capability is advertised via VHOST_BACKEND_F_IOTLB_BATCH capability. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200804162048.22587-5-eli@mellanox.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 36 +++++++++++++++++++++++++++--------- include/uapi/linux/vhost.h | 2 ++ include/uapi/linux/vhost_types.h | 11 +++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 61c17d34cb39..e80db051845d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -27,7 +27,9 @@ #include "vhost.h" enum { - VHOST_VDPA_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) + VHOST_VDPA_BACKEND_FEATURES = + (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) | + (1ULL << VHOST_BACKEND_F_IOTLB_BATCH), }; /* Currently, only network backend w/o multiqueue is supported. */ @@ -48,6 +50,7 @@ struct vhost_vdpa { int virtio_id; int minor; struct eventfd_ctx *config_ctx; + int in_batch; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -124,6 +127,7 @@ static void vhost_vdpa_reset(struct vhost_vdpa *v) struct vdpa_device *vdpa = v->vdpa; vdpa_reset(vdpa); + v->in_batch = 0; } static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) @@ -546,13 +550,15 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) return r; - if (ops->dma_map) + if (ops->dma_map) { r = ops->dma_map(vdpa, iova, size, pa, perm); - else if (ops->set_map) - r = ops->set_map(vdpa, dev->iotlb); - else + } else if (ops->set_map) { + if (!v->in_batch) + r = ops->set_map(vdpa, dev->iotlb); + } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); + } return r; } @@ -565,12 +571,14 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); - if (ops->dma_map) + if (ops->dma_map) { ops->dma_unmap(vdpa, iova, size); - else if (ops->set_map) - ops->set_map(vdpa, dev->iotlb); - else + } else if (ops->set_map) { + if (!v->in_batch) + ops->set_map(vdpa, dev->iotlb); + } else { iommu_unmap(v->domain, iova, size); + } } static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, @@ -663,6 +671,8 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb_msg *msg) { struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; int r = 0; r = vhost_dev_check_owner(dev); @@ -676,6 +686,14 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, case VHOST_IOTLB_INVALIDATE: vhost_vdpa_unmap(v, msg->iova, msg->size); break; + case VHOST_IOTLB_BATCH_BEGIN: + v->in_batch = true; + break; + case VHOST_IOTLB_BATCH_END: + if (v->in_batch && ops->set_map) + ops->set_map(vdpa, dev->iotlb); + v->in_batch = false; + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 0c2349612e77..75232185324a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -91,6 +91,8 @@ /* Use message type V2 */ #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 +/* IOTLB can accept batching hints */ +#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 669457ce5c48..9a269a88a6ff 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -60,6 +60,17 @@ struct vhost_iotlb_msg { #define VHOST_IOTLB_UPDATE 2 #define VHOST_IOTLB_INVALIDATE 3 #define VHOST_IOTLB_ACCESS_FAIL 4 +/* + * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying + * multiple mappings in one go: beginning with + * VHOST_IOTLB_BATCH_BEGIN, followed by any number of + * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END. + * When one of these two values is used as the message type, the rest + * of the fields in the message are ignored. There's no guarantee that + * these changes take place automatically in the device. + */ +#define VHOST_IOTLB_BATCH_BEGIN 5 +#define VHOST_IOTLB_BATCH_END 6 __u8 type; }; -- cgit v1.2.3-59-g8ed1b From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 4 Aug 2020 22:50:56 -0700 Subject: bpf: Change uapi for bpf iterator map elements Commit a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") added bpf iterator support for map elements. The map element bpf iterator requires info to identify a particular map. In the above commit, the attr->link_create.target_fd is used to carry map_fd and an enum bpf_iter_link_info is added to uapi to specify the target_fd actually representing a map_fd: enum bpf_iter_link_info { BPF_ITER_LINK_UNSPEC = 0, BPF_ITER_LINK_MAP_FD = 1, MAX_BPF_ITER_LINK_INFO, }; This is an extensible approach as we can grow enumerator for pid, cgroup_id, etc. and we can unionize target_fd for pid, cgroup_id, etc. But in the future, there are chances that more complex customization may happen, e.g., for tasks, it could be filtered based on both cgroup_id and user_id. This patch changed the uapi to have fields __aligned_u64 iter_info; __u32 iter_info_len; for additional iter_info for link_create. The iter_info is defined as union bpf_iter_link_info { struct { __u32 map_fd; } map; }; So future extension for additional customization will be easier. The bpf_iter_link_info will be passed to target callback to validate and generic bpf_iter framework does not need to deal it any more. Note that map_fd = 0 will be considered invalid and -EBADF will be returned to user space. Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com --- include/linux/bpf.h | 10 ++++---- include/uapi/linux/bpf.h | 15 ++++++------ kernel/bpf/bpf_iter.c | 58 +++++++++++++++++++++++------------------------ kernel/bpf/map_iter.c | 37 +++++++++++++++++++++++------- kernel/bpf/syscall.c | 2 +- net/core/bpf_sk_storage.c | 37 +++++++++++++++++++++++------- 6 files changed, 102 insertions(+), 57 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cef4ef0d2b4e..55f694b63164 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1214,15 +1214,17 @@ struct bpf_iter_aux_info { struct bpf_map *map; }; -typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux); +typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; - bpf_iter_check_target_t check_target; + bpf_iter_attach_target_t attach_target; + bpf_iter_detach_target_t detach_target; u32 ctx_arg_info_size; - enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b134e679e9db..0480f893facd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type */ }; +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -249,13 +255,6 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; -enum bpf_iter_link_info { - BPF_ITER_LINK_UNSPEC = 0, - BPF_ITER_LINK_MAP_FD = 1, - - MAX_BPF_ITER_LINK_INFO, -}; - /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -623,6 +622,8 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 363b9cafc2d8..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link) struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); - if (iter_link->aux.map) - bpf_map_put_with_uref(iter_link->aux.map); + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; - struct bpf_iter_aux_info aux = {}; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; - u32 prog_btf_id, target_fd; + u32 prog_btf_id, linfo_len; bool existed = false; - struct bpf_map *map; int err; + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; - /* Make sure user supplied flags are target expected. */ - target_fd = attr->link_create.target_fd; - if (attr->link_create.flags != tinfo->reg_info->req_linfo) - return -EINVAL; - if (!attr->link_create.flags && target_fd) - return -EINVAL; - link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } - if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { - map = bpf_map_get_with_uref(target_fd); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto cleanup_link; - } - - aux.map = map; - err = tinfo->reg_info->check_target(prog, &aux); + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { - bpf_map_put_with_uref(map); - goto cleanup_link; + bpf_link_cleanup(&link_primer); + return err; } - - link->aux.map = map; } return bpf_link_settle(&link_primer); - -cleanup_link: - bpf_link_cleanup(&link_primer); - return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index fbe1f557cb88..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { u32 key_acc_size, value_acc_size, key_size, value_size; - struct bpf_map *map = aux->map; + struct bpf_map *map; bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) - return -EINVAL; + goto put_map; key_acc_size = prog->aux->max_rdonly_access; value_acc_size = prog->aux->max_rdwr_access; @@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else value_size = round_up(map->value_size, 8) * num_possible_cpus(); - if (key_acc_size > key_size || value_acc_size > value_size) - return -EACCES; + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, @@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f343ce15747..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d3377c90a291..b988f48153a4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, return 0; } -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { - struct bpf_map *map = aux->map; + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -EINVAL; + goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) - return -EACCES; + if (prog->aux->max_rdonly_access > map->value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { @@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), -- cgit v1.2.3-59-g8ed1b From 7f317d34906c1033f0752fc137dda04e43979bb8 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 11 Aug 2020 18:34:19 -0700 Subject: include/: replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Signed-off-by: Alexander A. Klimov Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Link: http://lkml.kernel.org/r/20200726110117.16346-1-grandmaster@al2klimov.de Signed-off-by: Linus Torvalds --- include/clocksource/timer-ti-dm.h | 2 +- include/linux/btree.h | 2 +- include/linux/delay.h | 2 +- include/linux/dma/k3-psil.h | 2 +- include/linux/dma/k3-udma-glue.h | 2 +- include/linux/dma/ti-cppi5.h | 2 +- include/linux/irqchip/irq-omap-intc.h | 2 +- include/linux/jhash.h | 2 +- include/linux/leds-ti-lmu-common.h | 2 +- include/linux/platform_data/davinci-cpufreq.h | 2 +- include/linux/platform_data/davinci_asp.h | 2 +- include/linux/platform_data/elm.h | 2 +- include/linux/platform_data/gpio-davinci.h | 2 +- include/linux/platform_data/gpmc-omap.h | 2 +- include/linux/platform_data/mtd-davinci-aemif.h | 2 +- include/linux/platform_data/omap-twl4030.h | 2 +- include/linux/platform_data/uio_pruss.h | 2 +- include/linux/platform_data/usb-omap.h | 2 +- include/linux/soc/ti/k3-ringacc.h | 2 +- include/linux/soc/ti/knav_qmss.h | 2 +- include/linux/soc/ti/ti-msgmgr.h | 2 +- include/linux/wkup_m3_ipc.h | 2 +- include/linux/xxhash.h | 2 +- include/linux/xz.h | 2 +- include/linux/zlib.h | 2 +- include/soc/arc/aux.h | 2 +- include/uapi/linux/elf.h | 2 +- include/uapi/linux/map_to_7segment.h | 2 +- include/uapi/linux/types.h | 2 +- include/uapi/linux/usb/ch9.h | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index 531ca87fcd08..4c61dade8835 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -1,7 +1,7 @@ /* * OMAP Dual-Mode Timers * - * Copyright (C) 2010 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2010 Texas Instruments Incorporated - https://www.ti.com/ * Tarun Kanti DebBarma * Thara Gopinath * diff --git a/include/linux/btree.h b/include/linux/btree.h index 68f858c831b1..243ee544397a 100644 --- a/include/linux/btree.h +++ b/include/linux/btree.h @@ -10,7 +10,7 @@ * * A B+Tree is a data structure for looking up arbitrary (currently allowing * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure - * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not + * is described at https://en.wikipedia.org/wiki/B-tree, we currently do not * use binary search to find the key on lookups. * * Each B+Tree consists of a head, that contains bookkeeping information and diff --git a/include/linux/delay.h b/include/linux/delay.h index 5e016a4029d9..1d0e2ce6b6d9 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -16,7 +16,7 @@ * 3. CPU clock rate changes. * * Please see this thread: - * http://lists.openwall.net/linux-kernel/2011/01/09/56 + * https://lists.openwall.net/linux-kernel/2011/01/09/56 */ #include diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h index 61d5cc0ad601..1962f75fa2d3 100644 --- a/include/linux/dma/k3-psil.h +++ b/include/linux/dma/k3-psil.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef K3_PSIL_H_ diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index caadbab1632a..5eb34ad973a7 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef K3_UDMA_GLUE_H_ diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index 579356ae447e..5896441ee604 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -2,7 +2,7 @@ /* * CPPI5 descriptors interface * - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef __TI_CPPI5_H__ diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h index 216e5adf80ce..dca379c0d7eb 100644 --- a/include/linux/irqchip/irq-omap-intc.h +++ b/include/linux/irqchip/irq-omap-intc.h @@ -2,7 +2,7 @@ /** * irq-omap-intc.h - INTC Idle Functions * - * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com * * Author: Felipe Balbi */ diff --git a/include/linux/jhash.h b/include/linux/jhash.h index ba2f6a9776b6..19ddd43aee68 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -5,7 +5,7 @@ * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * - * http://burtleburtle.net/bob/hash/ + * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * diff --git a/include/linux/leds-ti-lmu-common.h b/include/linux/leds-ti-lmu-common.h index 5eb111f38803..420b61e5a213 100644 --- a/include/linux/leds-ti-lmu-common.h +++ b/include/linux/leds-ti-lmu-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ // TI LMU Common Core -// Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com/ +// Copyright (C) 2018 Texas Instruments Incorporated - https://www.ti.com/ #ifndef _TI_LMU_COMMON_H_ #define _TI_LMU_COMMON_H_ diff --git a/include/linux/platform_data/davinci-cpufreq.h b/include/linux/platform_data/davinci-cpufreq.h index 3fbf9f2793b5..bc208c64e3d7 100644 --- a/include/linux/platform_data/davinci-cpufreq.h +++ b/include/linux/platform_data/davinci-cpufreq.h @@ -2,7 +2,7 @@ /* * TI DaVinci CPUFreq platform support. * - * Copyright (C) 2009 Texas Instruments, Inc. http://www.ti.com/ + * Copyright (C) 2009 Texas Instruments, Inc. https://www.ti.com/ */ #ifndef _MACH_DAVINCI_CPUFREQ_H diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h index 7fe80f1c7e08..5d1fb0d78a22 100644 --- a/include/linux/platform_data/davinci_asp.h +++ b/include/linux/platform_data/davinci_asp.h @@ -1,7 +1,7 @@ /* * TI DaVinci Audio Serial Port support * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/elm.h b/include/linux/platform_data/elm.h index 0f491d8abfdd..3cc78f0447b1 100644 --- a/include/linux/platform_data/elm.h +++ b/include/linux/platform_data/elm.h @@ -2,7 +2,7 @@ /* * BCH Error Location Module * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/ */ #ifndef __ELM_H diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h index a93841bfb9f7..e182a46e609f 100644 --- a/include/linux/platform_data/gpio-davinci.h +++ b/include/linux/platform_data/gpio-davinci.h @@ -1,7 +1,7 @@ /* * DaVinci GPIO Platform Related Defines * - * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2013 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h index ef663e570552..c9cc4e32435d 100644 --- a/include/linux/platform_data/gpmc-omap.h +++ b/include/linux/platform_data/gpmc-omap.h @@ -2,7 +2,7 @@ /* * OMAP GPMC Platform data * - * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com + * Copyright (C) 2014 Texas Instruments, Inc. - https://www.ti.com * Roger Quadros */ diff --git a/include/linux/platform_data/mtd-davinci-aemif.h b/include/linux/platform_data/mtd-davinci-aemif.h index a403dd51dacc..a49826214a39 100644 --- a/include/linux/platform_data/mtd-davinci-aemif.h +++ b/include/linux/platform_data/mtd-davinci-aemif.h @@ -1,7 +1,7 @@ /* * TI DaVinci AEMIF support * - * Copyright 2010 (C) Texas Instruments, Inc. http://www.ti.com/ + * Copyright 2010 (C) Texas Instruments, Inc. https://www.ti.com/ * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any diff --git a/include/linux/platform_data/omap-twl4030.h b/include/linux/platform_data/omap-twl4030.h index 8419c8caf54e..0dd851ea1c72 100644 --- a/include/linux/platform_data/omap-twl4030.h +++ b/include/linux/platform_data/omap-twl4030.h @@ -3,7 +3,7 @@ * omap-twl4030.h - ASoC machine driver for TI SoC based boards with twl4030 * codec, header. * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com * All rights reserved. * * Author: Peter Ujfalusi diff --git a/include/linux/platform_data/uio_pruss.h b/include/linux/platform_data/uio_pruss.h index 3d47d219827f..31f2e22661bc 100644 --- a/include/linux/platform_data/uio_pruss.h +++ b/include/linux/platform_data/uio_pruss.h @@ -3,7 +3,7 @@ * * Platform data for uio_pruss driver * - * Copyright (C) 2010-11 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2010-11 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/usb-omap.h b/include/linux/platform_data/usb-omap.h index fa579b4c666b..5e70d667031c 100644 --- a/include/linux/platform_data/usb-omap.h +++ b/include/linux/platform_data/usb-omap.h @@ -1,7 +1,7 @@ /* * usb-omap.h - Platform data for the various OMAP USB IPs * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com * * This software is distributed under the terms of the GNU General Public * License ("GPL") version 2, as published by the Free Software Foundation. diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 7ac115432fa1..5a472eca5ee4 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -2,7 +2,7 @@ /* * K3 Ring Accelerator (RA) subsystem interface * - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef __SOC_TI_K3_RINGACC_API_H_ diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h index 9745df6ed9d3..c75ef99c99ca 100644 --- a/include/linux/soc/ti/knav_qmss.h +++ b/include/linux/soc/ti/knav_qmss.h @@ -1,7 +1,7 @@ /* * Keystone Navigator Queue Management Sub-System header * - * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com * Author: Sandeep Nair * Cyril Chemparathy * Santosh Shilimkar diff --git a/include/linux/soc/ti/ti-msgmgr.h b/include/linux/soc/ti/ti-msgmgr.h index eac8e0c6fe11..1f6e76d423cf 100644 --- a/include/linux/soc/ti/ti-msgmgr.h +++ b/include/linux/soc/ti/ti-msgmgr.h @@ -1,7 +1,7 @@ /* * Texas Instruments' Message Manager * - * Copyright (C) 2015-2016 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2015-2016 Texas Instruments Incorporated - https://www.ti.com/ * Nishanth Menon * * This program is free software; you can redistribute it and/or modify diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index e497e621dbb7..3f496967b538 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -1,7 +1,7 @@ /* * TI Wakeup M3 for AMx3 SoCs Power Management Routines * - * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/ * Dave Gerlach * * This program is free software; you can redistribute it and/or diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 52b073fea17f..df42511438d0 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -34,7 +34,7 @@ * ("BSD"). * * You can contact the author at: - * - xxHash homepage: http://cyan4973.github.io/xxHash/ + * - xxHash homepage: https://cyan4973.github.io/xxHash/ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ diff --git a/include/linux/xz.h b/include/linux/xz.h index 24ad7d875977..9884c8440188 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -2,7 +2,7 @@ * XZ decompressor * * Authors: Lasse Collin - * Igor Pavlov + * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. diff --git a/include/linux/zlib.h b/include/linux/zlib.h index c757d848a758..78ede944c082 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -23,7 +23,7 @@ The data format used by the zlib library is described by RFCs (Request for - Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + Comments) 1950 to 1952 in the files https://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). */ diff --git a/include/soc/arc/aux.h b/include/soc/arc/aux.h index e223c4ffa153..9c2eff6140b6 100644 --- a/include/soc/arc/aux.h +++ b/include/soc/arc/aux.h @@ -22,7 +22,7 @@ static inline int read_aux_reg(u32 r) /* * function helps elide unused variable warning - * see: http://lists.infradead.org/pipermail/linux-snps-arc/2016-November/001748.html + * see: https://lists.infradead.org/pipermail/linux-snps-arc/2016-November/001748.html */ static inline void write_aux_reg(u32 r, u32 v) { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c6dd0215482e..22220945a5fd 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -53,7 +53,7 @@ typedef __s64 Elf64_Sxword; * * - Oracle: Linker and Libraries. * Part No: 817–1984–19, August 2011. - * http://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf + * https://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf * * - System V ABI AMD64 Architecture Processor Supplement * Draft Version 0.99.4, diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h index f9ed18134b83..13a06e5e966e 100644 --- a/include/uapi/linux/map_to_7segment.h +++ b/include/uapi/linux/map_to_7segment.h @@ -24,7 +24,7 @@ * of (ASCII) characters to a 7-segments notation. * * The 7 segment's wikipedia notation below is used as standard. - * See: http://en.wikipedia.org/wiki/Seven_segment_display + * See: https://en.wikipedia.org/wiki/Seven_segment_display * * Notation: +-a-+ * f b diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index 2fce8b6876e9..f6d2f83cbe29 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -7,7 +7,7 @@ #ifndef __ASSEMBLY__ #ifndef __KERNEL__ #ifndef __EXPORTED_HEADERS__ -#warning "Attempt to use kernel headers from user space, see http://kernelnewbies.org/KernelHeaders" +#warning "Attempt to use kernel headers from user space, see https://kernelnewbies.org/KernelHeaders" #endif /* __EXPORTED_HEADERS__ */ #endif diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 48766fdf6580..0f865ae4ba89 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -1229,7 +1229,7 @@ struct usb_set_sel_req { * As per USB compliance update, a device that is actively drawing * more than 100mA from USB must report itself as bus-powered in * the GetStatus(DEVICE) call. - * http://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 + * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 */ #define USB_SELF_POWER_VBUS_MAX_DRAW 100 -- cgit v1.2.3-59-g8ed1b From 2fb3244f0a58ceb3d866ac63f644dfa31cae430f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 Aug 2020 18:35:21 -0700 Subject: autofs: fix doubled word Change doubled word "is" to "it is". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Acked-by: Ian Kent Link: http://lkml.kernel.org/r/5a82befd-40f8-8dc0-3498-cbc0436cad9b@infradead.org Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_dev-ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 374742651c30..62e625356dc8 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -82,7 +82,7 @@ struct args_ismountpoint { /* * All the ioctls use this structure. * When sending a path size must account for the total length - * of the chunk of memory otherwise is is the size of the + * of the chunk of memory otherwise it is the size of the * structure. */ -- cgit v1.2.3-59-g8ed1b