From 0c38740c08962ab109267cb23f4a40df2ccf2bbf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 17 Jun 2021 17:28:24 -0700 Subject: selftests/bpf: Fix ringbuf test fetching map FD Seems like 4d1b62986125 ("selftests/bpf: Convert few tests to light skeleton.") and 704e2beba23c ("selftests/bpf: Test ringbuf mmap read-only and read-write restrictions") were done independently on bpf and bpf-next trees and are in conflict with each other, despite a clean merge. Fix fetching of ringbuf's map_fd to use light skeleton properly. Fixes: 704e2beba23c ("selftests/bpf: Test ringbuf mmap read-only and read-write restrictions") Fixes: 4d1b62986125 ("selftests/bpf: Convert few tests to light skeleton.") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210618002824.2081922-1-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index a01788090c31..4706cee84360 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -100,7 +100,7 @@ void test_ringbuf(void) if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) goto cleanup; - rb_fd = bpf_map__fd(skel->maps.ringbuf); + rb_fd = skel->maps.ringbuf.map_fd; /* good read/write cons_pos */ mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos"); -- cgit v1.2.3-59-g8ed1b From 85102ba58b4125ebad941d7555c3c248b23efd16 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 16 Jun 2021 12:23:24 +0800 Subject: samples/bpf: Fix Segmentation fault for xdp_redirect command A Segmentation fault error is caused when the following command is executed. $ sudo ./samples/bpf/xdp_redirect lo Segmentation fault This command is missing a device as an argument, resulting in out-of-bounds access from argv. If the number of devices for the xdp_redirect parameter is not 2, we should report an error and exit. Fixes: 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps") Signed-off-by: Wang Hai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210616042324.314832-1-wanghai38@huawei.com --- samples/bpf/xdp_redirect_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 41d705c3a1f7..eb876629109a 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) xdp_flags |= XDP_FLAGS_DRV_MODE; - if (optind == argc) { + if (optind + 2 != argc) { printf("usage: %s _IN _OUT\n", argv[0]); return 1; } -- cgit v1.2.3-59-g8ed1b From 7c6090ee2a7b3315410cfc83a94c3eb057407b25 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 16 Jun 2021 12:25:34 +0800 Subject: samples/bpf: Fix the error return code of xdp_redirect's main() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. If bpf_map_update_elem() failed, main() should return a negative error. 
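(Illustrative only, not part of the patch: a minimal user-space sketch of the goto/out error pattern the fix restores. Names such as do_setup() are hypothetical; the point is that the status captured on the error path must be what main() ultimately returns.)

    #include <stdio.h>

    /* stand-in for a setup step such as bpf_map_update_elem() failing */
    static int do_setup(void)
    {
            return -1;
    }

    int main(void)
    {
            int ret;

            ret = do_setup();
            if (ret < 0) {
                    fprintf(stderr, "setup failed\n");
                    goto out;
            }
            /* ... main work ... */
    out:
            return ret;     /* propagate the error instead of returning 0 */
    }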
Fixes: 832622e6bd18 ("xdp: sample program for new bpf_redirect helper") Signed-off-by: Wang Hai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210616042534.315097-1-wanghai38@huawei.com --- samples/bpf/xdp_redirect_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index eb876629109a..93854e135134 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -213,5 +213,5 @@ int main(int argc, char **argv) poll_stats(2, ifindex_out); out: - return 0; + return ret; } -- cgit v1.2.3-59-g8ed1b From f42cfb469f9b4a1c002a03cce3d9329376800a6f Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Fri, 18 Jun 2021 14:04:59 +0000 Subject: bpf: Add documentation for libbpf including API autogen This patch is meant to start the initiative to document libbpf. It includes .rst files which are text documentation describing building, API naming convention, as well as an index to generated API documentation. In this approach the generated API documentation is enabled by the kernels existing kernel documentation system which uses sphinx. The resulting docs would then be synced to kernel.org/doc You can test this by running `make htmldocs` and serving the html in Documentation/output. Since libbpf does not yet have comments in kernel doc format, see kernel.org/doc/html/latest/doc-guide/kernel-doc.html for an example so you can test this. The advantage of this approach is to use the existing sphinx infrastructure that the kernel has, and have libbpf docs in the same place as everything else. The current plan is to have the libbpf mirror sync the generated docs and version them based on the libbpf releases which are cut on github. This patch includes the addition of libbpf_api.rst which pulls comment documentation from header files in libbpf under tools/lib/bpf/. The comment docs would be of the standard kernel doc format. Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210618140459.9887-2-grantseltzer@gmail.com --- Documentation/bpf/index.rst | 13 ++ Documentation/bpf/libbpf/libbpf.rst | 14 ++ Documentation/bpf/libbpf/libbpf_api.rst | 27 ++++ Documentation/bpf/libbpf/libbpf_build.rst | 37 +++++ .../bpf/libbpf/libbpf_naming_convention.rst | 162 ++++++++++++++++++++ tools/lib/bpf/README.rst | 168 --------------------- 6 files changed, 253 insertions(+), 168 deletions(-) create mode 100644 Documentation/bpf/libbpf/libbpf.rst create mode 100644 Documentation/bpf/libbpf/libbpf_api.rst create mode 100644 Documentation/bpf/libbpf/libbpf_build.rst create mode 100644 Documentation/bpf/libbpf/libbpf_naming_convention.rst delete mode 100644 tools/lib/bpf/README.rst diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 93e8cf12a6d4..baea6c2abba5 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -12,6 +12,19 @@ BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. +libbpf +====== + +Libbpf is a userspace library for loading and interacting with bpf programs. + +.. 
toctree:: + :maxdepth: 1 + + libbpf/libbpf + libbpf/libbpf_api + libbpf/libbpf_build + libbpf/libbpf_naming_convention + BPF Type Format (BTF) ===================== diff --git a/Documentation/bpf/libbpf/libbpf.rst b/Documentation/bpf/libbpf/libbpf.rst new file mode 100644 index 000000000000..1b1e61d5ead1 --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +libbpf +====== + +This is documentation for libbpf, a userspace library for loading and +interacting with bpf programs. + +All general BPF questions, including kernel functionality, libbpf APIs and +their application, should be sent to bpf@vger.kernel.org mailing list. +You can `subscribe `_ to the +mailing list search its `archive `_. +Please search the archive before asking new questions. It very well might +be that this was already addressed or answered before. diff --git a/Documentation/bpf/libbpf/libbpf_api.rst b/Documentation/bpf/libbpf/libbpf_api.rst new file mode 100644 index 000000000000..f07eecd054da --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf_api.rst @@ -0,0 +1,27 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +API +=== + +This documentation is autogenerated from header files in libbpf, tools/lib/bpf + +.. kernel-doc:: tools/lib/bpf/libbpf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/btf.h + :internal: + +.. kernel-doc:: tools/lib/bpf/xsk.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_tracing.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_core_read.h + :internal: + +.. kernel-doc:: tools/lib/bpf/bpf_endian.h + :internal: \ No newline at end of file diff --git a/Documentation/bpf/libbpf/libbpf_build.rst b/Documentation/bpf/libbpf/libbpf_build.rst new file mode 100644 index 000000000000..8e8c23e8093d --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf_build.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +Building libbpf +=============== + +libelf and zlib are internal dependencies of libbpf and thus are required to link +against and must be installed on the system for applications to work. +pkg-config is used by default to find libelf, and the program called +can be overridden with PKG_CONFIG. + +If using pkg-config at build time is not desired, it can be disabled by +setting NO_PKG_CONFIG=1 when calling make. + +To build both static libbpf.a and shared libbpf.so: + +.. code-block:: bash + + $ cd src + $ make + +To build only static libbpf.a library in directory build/ and install them +together with libbpf headers in a staging directory root/: + +.. code-block:: bash + + $ cd src + $ mkdir build root + $ BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install + +To build both static libbpf.a and shared libbpf.so against a custom libelf +dependency installed in /build/root/ and install them together with libbpf +headers in a build directory /build/root/: + +.. code-block:: bash + + $ cd src + $ PKG_CONFIG_PATH=/build/root/lib64/pkgconfig DESTDIR=/build/root make \ No newline at end of file diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst new file mode 100644 index 000000000000..3de1d51e41da --- /dev/null +++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst @@ -0,0 +1,162 @@ +.. 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +API naming convention +===================== + +libbpf API provides access to a few logically separated groups of +functions and types. Every group has its own naming convention +described here. It's recommended to follow these conventions whenever a +new function or type is added to keep libbpf API clean and consistent. + +All types and functions provided by libbpf API should have one of the +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, +``btf_dump_``, ``ring_buffer_``, ``perf_buffer_``. + +System call wrappers +-------------------- + +System call wrappers are simple wrappers for commands supported by +sys_bpf system call. These wrappers should go to ``bpf.h`` header file +and map one to one to corresponding commands. + +For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM`` +command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc. + +Objects +------- + +Another class of types and functions provided by libbpf API is "objects" +and functions to work with them. Objects are high-level abstractions +such as BPF program or BPF map. They're represented by corresponding +structures such as ``struct bpf_object``, ``struct bpf_program``, +``struct bpf_map``, etc. + +Structures are forward declared and access to their fields should be +provided via corresponding getters and setters rather than directly. + +These objects are associated with corresponding parts of ELF object that +contains compiled BPF programs. + +For example ``struct bpf_object`` represents ELF object itself created +from an ELF file or from a buffer, ``struct bpf_program`` represents a +program in ELF object and ``struct bpf_map`` is a map. + +Functions that work with an object have names built from object name, +double underscore and part that describes function purpose. + +For example ``bpf_object__open`` consists of the name of corresponding +object, ``bpf_object``, double underscore and ``open`` that defines the +purpose of the function to open ELF file and create ``bpf_object`` from +it. + +All objects and corresponding functions other than BTF related should go +to ``libbpf.h``. BTF types and functions should go to ``btf.h``. + +Auxiliary functions +------------------- + +Auxiliary functions and types that don't fit well in any of categories +described above should have ``libbpf_`` prefix, e.g. +``libbpf_get_error`` or ``libbpf_prog_type_by_name``. + +AF_XDP functions +------------------- + +AF_XDP functions should have an ``xsk_`` prefix, e.g. +``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists +of both low-level ring access functions and high-level configuration +functions. These can be mixed and matched. Note that these functions +are not reentrant for performance reasons. + +ABI +========== + +libbpf can be both linked statically or used as DSO. To avoid possible +conflicts with other libraries an application is linked with, all +non-static libbpf symbols should have one of the prefixes mentioned in +API documentation above. See API naming convention to choose the right +name for a new symbol. + +Symbol visibility +----------------- + +libbpf follow the model when all global symbols have visibility "hidden" +by default and to make a symbol visible it has to be explicitly +attributed with ``LIBBPF_API`` macro. For example: + +.. 
code-block:: c + + LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); + +This prevents from accidentally exporting a symbol, that is not supposed +to be a part of ABI what, in turn, improves both libbpf developer- and +user-experiences. + +ABI versionning +--------------- + +To make future ABI extensions possible libbpf ABI is versioned. +Versioning is implemented by ``libbpf.map`` version script that is +passed to linker. + +Version name is ``LIBBPF_`` prefix + three-component numeric version, +starting from ``0.0.1``. + +Every time ABI is being changed, e.g. because a new symbol is added or +semantic of existing symbol is changed, ABI version should be bumped. +This bump in ABI version is at most once per kernel development cycle. + +For example, if current state of ``libbpf.map`` is: + +.. code-block:: c + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + +, and a new symbol ``bpf_func_c`` is being introduced, then +``libbpf.map`` should be changed like this: + +.. code-block:: c + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + LIBBPF_0.0.2 { + global: + bpf_func_c; + } LIBBPF_0.0.1; + +, where new version ``LIBBPF_0.0.2`` depends on the previous +``LIBBPF_0.0.1``. + +Format of version script and ways to handle ABI changes, including +incompatible ones, described in details in [1]. + +Stand-alone build +------------------- + +Under https://github.com/libbpf/libbpf there is a (semi-)automated +mirror of the mainline's version of libbpf for a stand-alone build. + +However, all changes to libbpf's code base must be upstreamed through +the mainline kernel tree. + +License +------------------- + +libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. + +Links +------------------- + +[1] https://www.akkadia.org/drepper/dsohowto.pdf + (Chapter 3. Maintaining APIs and ABIs). diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst deleted file mode 100644 index 8928f7787f2d..000000000000 --- a/tools/lib/bpf/README.rst +++ /dev/null @@ -1,168 +0,0 @@ -.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -libbpf API naming convention -============================ - -libbpf API provides access to a few logically separated groups of -functions and types. Every group has its own naming convention -described here. It's recommended to follow these conventions whenever a -new function or type is added to keep libbpf API clean and consistent. - -All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, -``perf_buffer_``. - -System call wrappers --------------------- - -System call wrappers are simple wrappers for commands supported by -sys_bpf system call. These wrappers should go to ``bpf.h`` header file -and map one-on-one to corresponding commands. - -For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM`` -command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc. - -Objects -------- - -Another class of types and functions provided by libbpf API is "objects" -and functions to work with them. Objects are high-level abstractions -such as BPF program or BPF map. They're represented by corresponding -structures such as ``struct bpf_object``, ``struct bpf_program``, -``struct bpf_map``, etc. - -Structures are forward declared and access to their fields should be -provided via corresponding getters and setters rather than directly. - -These objects are associated with corresponding parts of ELF object that -contains compiled BPF programs. 
- -For example ``struct bpf_object`` represents ELF object itself created -from an ELF file or from a buffer, ``struct bpf_program`` represents a -program in ELF object and ``struct bpf_map`` is a map. - -Functions that work with an object have names built from object name, -double underscore and part that describes function purpose. - -For example ``bpf_object__open`` consists of the name of corresponding -object, ``bpf_object``, double underscore and ``open`` that defines the -purpose of the function to open ELF file and create ``bpf_object`` from -it. - -Another example: ``bpf_program__load`` is named for corresponding -object, ``bpf_program``, that is separated from other part of the name -by double underscore. - -All objects and corresponding functions other than BTF related should go -to ``libbpf.h``. BTF types and functions should go to ``btf.h``. - -Auxiliary functions -------------------- - -Auxiliary functions and types that don't fit well in any of categories -described above should have ``libbpf_`` prefix, e.g. -``libbpf_get_error`` or ``libbpf_prog_type_by_name``. - -AF_XDP functions -------------------- - -AF_XDP functions should have an ``xsk_`` prefix, e.g. -``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists -of both low-level ring access functions and high-level configuration -functions. These can be mixed and matched. Note that these functions -are not reentrant for performance reasons. - -Please take a look at Documentation/networking/af_xdp.rst in the Linux -kernel source tree on how to use XDP sockets and for some common -mistakes in case you do not get any traffic up to user space. - -libbpf ABI -========== - -libbpf can be both linked statically or used as DSO. To avoid possible -conflicts with other libraries an application is linked with, all -non-static libbpf symbols should have one of the prefixes mentioned in -API documentation above. See API naming convention to choose the right -name for a new symbol. - -Symbol visibility ------------------ - -libbpf follow the model when all global symbols have visibility "hidden" -by default and to make a symbol visible it has to be explicitly -attributed with ``LIBBPF_API`` macro. For example: - -.. code-block:: c - - LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); - -This prevents from accidentally exporting a symbol, that is not supposed -to be a part of ABI what, in turn, improves both libbpf developer- and -user-experiences. - -ABI versionning ---------------- - -To make future ABI extensions possible libbpf ABI is versioned. -Versioning is implemented by ``libbpf.map`` version script that is -passed to linker. - -Version name is ``LIBBPF_`` prefix + three-component numeric version, -starting from ``0.0.1``. - -Every time ABI is being changed, e.g. because a new symbol is added or -semantic of existing symbol is changed, ABI version should be bumped. -This bump in ABI version is at most once per kernel development cycle. - -For example, if current state of ``libbpf.map`` is: - -.. code-block:: - LIBBPF_0.0.1 { - global: - bpf_func_a; - bpf_func_b; - local: - \*; - }; - -, and a new symbol ``bpf_func_c`` is being introduced, then -``libbpf.map`` should be changed like this: - -.. code-block:: - LIBBPF_0.0.1 { - global: - bpf_func_a; - bpf_func_b; - local: - \*; - }; - LIBBPF_0.0.2 { - global: - bpf_func_c; - } LIBBPF_0.0.1; - -, where new version ``LIBBPF_0.0.2`` depends on the previous -``LIBBPF_0.0.1``. 
- -Format of version script and ways to handle ABI changes, including -incompatible ones, described in details in [1]. - -Stand-alone build -================= - -Under https://github.com/libbpf/libbpf there is a (semi-)automated -mirror of the mainline's version of libbpf for a stand-alone build. - -However, all changes to libbpf's code base must be upstreamed through -the mainline kernel tree. - -License -======= - -libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. - -Links -===== - -[1] https://www.akkadia.org/drepper/dsohowto.pdf - (Chapter 3. Maintaining APIs and ABIs). -- cgit v1.2.3-59-g8ed1b From 5c10a3dbe9220ca7bcee716c13c8a8563bcb010a Mon Sep 17 00:00:00 2001 From: Jonathan Edwards Date: Sat, 19 Jun 2021 11:10:07 -0400 Subject: libbpf: Add extra BPF_PROG_TYPE check to bpf_object__probe_loading eBPF has been backported for RHEL 7 w/ kernel 3.10-940+ [0]. However only the following program types are supported [1]: BPF_PROG_TYPE_KPROBE BPF_PROG_TYPE_TRACEPOINT BPF_PROG_TYPE_PERF_EVENT For libbpf this causes an EINVAL return during the bpf_object__probe_loading call which only checks to see if programs of type BPF_PROG_TYPE_SOCKET_FILTER can load. The following will try BPF_PROG_TYPE_TRACEPOINT as a fallback attempt before erroring out. BPF_PROG_TYPE_KPROBE was not a good candidate because on some kernels it requires knowledge of the LINUX_VERSION_CODE. [0] https://www.redhat.com/en/blog/introduction-ebpf-red-hat-enterprise-linux-7 [1] https://access.redhat.com/articles/3550581 Signed-off-by: Jonathan Edwards Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210619151007.GA6963@165gc.onmicrosoft.com --- tools/lib/bpf/libbpf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 48c0ade05ab1..1e04ce724240 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4000,6 +4000,10 @@ bpf_object__probe_loading(struct bpf_object *obj) attr.license = "GPL"; ret = bpf_load_program_xattr(&attr, NULL, 0); + if (ret < 0) { + attr.prog_type = BPF_PROG_TYPE_TRACEPOINT; + ret = bpf_load_program_xattr(&attr, NULL, 0); + } if (ret < 0) { ret = errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); -- cgit v1.2.3-59-g8ed1b From 0ae64fb6b645e0f976e08bc3c05e518856f19d00 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Jun 2021 09:44:53 +0530 Subject: libbpf: Add request buffer type for netlink messages Coverity complains about OOB writes to nlmsghdr. There is no OOB as we write to the trailing buffer, but static analyzers and compilers may rightfully be confused as the nlmsghdr pointer has subobject provenance (and hence subobject bounds). Fix this by using an explicit request structure containing the nlmsghdr, struct tcmsg/ifinfomsg, and attribute buffer. Also switch nh_tail (renamed to req_tail) to cast req * to char * so that it can be understood as arithmetic on pointer to the representation array (hence having same bound as request structure), which should further appease analyzers. As a bonus, callers don't have to pass sizeof(req) all the time now, as size is implicitly obtained using the pointer. While at it, also reduce the size of attribute buffer to 128 bytes (132 for ifinfomsg using functions due to the padding). 
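(Illustrative sketch, mirroring the shape of the patch rather than reproducing it; the struct and helper names below are simplified stand-ins. Embedding the nlmsghdr as the first member of one request object means the tail is computed from a pointer whose bounds cover the whole request, including the trailing attribute space.)

    #include <linux/netlink.h>

    struct nla_req {
            struct nlmsghdr nh;
            char buf[128];          /* trailing space for attributes */
    };

    /* tail is derived from the whole-object pointer, not from &req->nh */
    static inline void *req_tail(struct nla_req *req)
    {
            return (char *)req + NLMSG_ALIGN(req->nh.nlmsg_len);
    }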
Summary of problem: Even though C standard allows interconvertibility of pointer to first member and pointer to struct, for the purposes of alias analysis it would still consider the first as having pointer value "pointer to T" where T is type of first member hence having subobject bounds, allowing analyzers within reason to complain when object is accessed beyond the size of pointed to object. The only exception to this rule may be when a char * is formed to a member subobject. It is not possible for the compiler to be able to tell the intent of the programmer that it is a pointer to member object or the underlying representation array of the containing object, so such diagnosis is suppressed. Fixes: 715c5ce454a6 ("libbpf: Add low level TC-BPF management API") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210619041454.417577-1-memxor@gmail.com --- tools/lib/bpf/netlink.c | 113 ++++++++++++++++++------------------------------ tools/lib/bpf/nlattr.h | 34 ++++++++++----- 2 files changed, 65 insertions(+), 82 deletions(-) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index cf9381f03b16..bfaa9a5c50f3 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -154,7 +154,7 @@ done: return ret; } -static int libbpf_netlink_send_recv(struct nlmsghdr *nh, +static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, __dump_nlmsg_t parse_msg, libbpf_dump_nlmsg_t parse_attr, void *cookie) @@ -166,15 +166,15 @@ static int libbpf_netlink_send_recv(struct nlmsghdr *nh, if (sock < 0) return sock; - nh->nlmsg_pid = 0; - nh->nlmsg_seq = time(NULL); + req->nh.nlmsg_pid = 0; + req->nh.nlmsg_seq = time(NULL); - if (send(sock, nh, nh->nlmsg_len, 0) < 0) { + if (send(sock, req, req->nh.nlmsg_len, 0) < 0) { ret = -errno; goto out; } - ret = libbpf_netlink_recv(sock, nl_pid, nh->nlmsg_seq, + ret = libbpf_netlink_recv(sock, nl_pid, req->nh.nlmsg_seq, parse_msg, parse_attr, cookie); out: libbpf_netlink_close(sock); @@ -186,11 +186,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, { struct nlattr *nla; int ret; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; + struct libbpf_nla_req req; memset(&req, 0, sizeof(req)); req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); @@ -199,27 +195,26 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, req.ifinfo.ifi_family = AF_UNSPEC; req.ifinfo.ifi_index = ifindex; - nla = nlattr_begin_nested(&req.nh, sizeof(req), IFLA_XDP); + nla = nlattr_begin_nested(&req, IFLA_XDP); if (!nla) return -EMSGSIZE; - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_FD, &fd, sizeof(fd)); + ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd)); if (ret < 0) return ret; if (flags) { - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_FLAGS, &flags, - sizeof(flags)); + ret = nlattr_add(&req, IFLA_XDP_FLAGS, &flags, sizeof(flags)); if (ret < 0) return ret; } if (flags & XDP_FLAGS_REPLACE) { - ret = nlattr_add(&req.nh, sizeof(req), IFLA_XDP_EXPECTED_FD, - &old_fd, sizeof(old_fd)); + ret = nlattr_add(&req, IFLA_XDP_EXPECTED_FD, &old_fd, + sizeof(old_fd)); if (ret < 0) return ret; } - nlattr_end_nested(&req.nh, nla); + nlattr_end_nested(&req, nla); - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, @@ -314,14 +309,11 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, 
struct xdp_id_md xdp_id = {}; __u32 mask; int ret; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifm; - } req = { - .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - .nh.nlmsg_type = RTM_GETLINK, - .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, - .ifm.ifi_family = AF_PACKET, + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifinfo.ifi_family = AF_PACKET, }; if (flags & ~XDP_FLAGS_MASK || !info_size) @@ -336,7 +328,7 @@ int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, xdp_id.ifindex = ifindex; xdp_id.flags = flags; - ret = libbpf_netlink_send_recv(&req.nh, __dump_link_nlmsg, + ret = libbpf_netlink_send_recv(&req, __dump_link_nlmsg, get_xdp_info, &xdp_id); if (!ret) { size_t sz = min(info_size, sizeof(xdp_id.info)); @@ -376,15 +368,14 @@ int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) return libbpf_err(ret); } -typedef int (*qdisc_config_t)(struct nlmsghdr *nh, struct tcmsg *t, - size_t maxsz); +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); -static int clsact_config(struct nlmsghdr *nh, struct tcmsg *t, size_t maxsz) +static int clsact_config(struct libbpf_nla_req *req) { - t->tcm_parent = TC_H_CLSACT; - t->tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + req->tc.tcm_parent = TC_H_CLSACT; + req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); - return nlattr_add(nh, maxsz, TCA_KIND, "clsact", sizeof("clsact")); + return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); } static int attach_point_to_config(struct bpf_tc_hook *hook, @@ -431,11 +422,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) { qdisc_config_t config; int ret; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; ret = attach_point_to_config(hook, &config); if (ret < 0) @@ -448,11 +435,11 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) req.tc.tcm_family = AF_UNSPEC; req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); - ret = config(&req.nh, &req.tc, sizeof(req)); + ret = config(&req); if (ret < 0) return ret; - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) @@ -544,7 +531,7 @@ static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, return __get_tc_info(cookie, tc, tb, nh->nlmsg_flags & NLM_F_ECHO); } -static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) +static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) { struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); @@ -555,7 +542,7 @@ static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) if (ret < 0) return ret; - ret = nlattr_add(nh, maxsz, TCA_BPF_FD, &fd, sizeof(fd)); + ret = nlattr_add(req, TCA_BPF_FD, &fd, sizeof(fd)); if (ret < 0) return ret; len = snprintf(name, sizeof(name), "%s:[%u]", info.name, info.id); @@ -563,7 +550,7 @@ static int tc_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd) return -errno; if (len >= sizeof(name)) return -ENAMETOOLONG; - return nlattr_add(nh, maxsz, TCA_BPF_NAME, name, len + 1); + return nlattr_add(req, TCA_BPF_NAME, name, len + 1); } int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) @@ -571,12 +558,8 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) __u32 protocol, bpf_flags, handle, priority, 
parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; struct nlattr *nla; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; if (!hook || !opts || !OPTS_VALID(hook, bpf_tc_hook) || @@ -618,25 +601,24 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) return libbpf_err(ret); req.tc.tcm_parent = parent; - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return libbpf_err(ret); - nla = nlattr_begin_nested(&req.nh, sizeof(req), TCA_OPTIONS); + nla = nlattr_begin_nested(&req, TCA_OPTIONS); if (!nla) return libbpf_err(-EMSGSIZE); - ret = tc_add_fd_and_name(&req.nh, sizeof(req), prog_fd); + ret = tc_add_fd_and_name(&req, prog_fd); if (ret < 0) return libbpf_err(ret); bpf_flags = TCA_BPF_FLAG_ACT_DIRECT; - ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_FLAGS, &bpf_flags, - sizeof(bpf_flags)); + ret = nlattr_add(&req, TCA_BPF_FLAGS, &bpf_flags, sizeof(bpf_flags)); if (ret < 0) return libbpf_err(ret); - nlattr_end_nested(&req.nh, nla); + nlattr_end_nested(&req, nla); info.opts = opts; - ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) @@ -650,11 +632,7 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, { __u32 protocol = 0, handle, priority, parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || @@ -701,13 +679,12 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, req.tc.tcm_parent = parent; if (!flush) { - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, - "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return ret; } - return libbpf_netlink_send_recv(&req.nh, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); } int bpf_tc_detach(const struct bpf_tc_hook *hook, @@ -727,11 +704,7 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) __u32 protocol, handle, priority, parent, prog_id, flags; int ret, ifindex, attach_point, prog_fd; struct bpf_cb_ctx info = {}; - struct { - struct nlmsghdr nh; - struct tcmsg tc; - char buf[256]; - } req; + struct libbpf_nla_req req; if (!hook || !opts || !OPTS_VALID(hook, bpf_tc_hook) || @@ -770,13 +743,13 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) return libbpf_err(ret); req.tc.tcm_parent = parent; - ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); if (ret < 0) return libbpf_err(ret); info.opts = opts; - ret = libbpf_netlink_send_recv(&req.nh, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 3c780ab6d022..76cbfeb21955 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -13,6 +13,7 @@ #include #include #include +#include /* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H @@ -52,6 +53,15 @@ struct libbpf_nla_policy { uint16_t maxlen; }; +struct libbpf_nla_req { + struct nlmsghdr nh; + union { + struct ifinfomsg ifinfo; 
+ struct tcmsg tc; + }; + char buf[128]; +}; + /** * @ingroup attr * Iterate over a stream of attributes @@ -111,44 +121,44 @@ static inline struct nlattr *nla_data(struct nlattr *nla) return (struct nlattr *)((char *)nla + NLA_HDRLEN); } -static inline struct nlattr *nh_tail(struct nlmsghdr *nh) +static inline struct nlattr *req_tail(struct libbpf_nla_req *req) { - return (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)); + return (struct nlattr *)((char *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); } -static inline int nlattr_add(struct nlmsghdr *nh, size_t maxsz, int type, +static inline int nlattr_add(struct libbpf_nla_req *req, int type, const void *data, int len) { struct nlattr *nla; - if (NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > maxsz) + if (NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > sizeof(*req)) return -EMSGSIZE; if (!!data != !!len) return -EINVAL; - nla = nh_tail(nh); + nla = req_tail(req); nla->nla_type = type; nla->nla_len = NLA_HDRLEN + len; if (data) memcpy(nla_data(nla), data, len); - nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(nla->nla_len); + req->nh.nlmsg_len = NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(nla->nla_len); return 0; } -static inline struct nlattr *nlattr_begin_nested(struct nlmsghdr *nh, - size_t maxsz, int type) +static inline struct nlattr *nlattr_begin_nested(struct libbpf_nla_req *req, int type) { struct nlattr *tail; - tail = nh_tail(nh); - if (nlattr_add(nh, maxsz, type | NLA_F_NESTED, NULL, 0)) + tail = req_tail(req); + if (nlattr_add(req, type | NLA_F_NESTED, NULL, 0)) return NULL; return tail; } -static inline void nlattr_end_nested(struct nlmsghdr *nh, struct nlattr *tail) +static inline void nlattr_end_nested(struct libbpf_nla_req *req, + struct nlattr *tail) { - tail->nla_len = (char *)nh_tail(nh) - (char *)tail; + tail->nla_len = (char *)req_tail(req) - (char *)tail; } #endif /* __LIBBPF_NLATTR_H */ -- cgit v1.2.3-59-g8ed1b From ee62a5c6bb100b6fb07f3da3818c10a24d440e10 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Jun 2021 09:44:54 +0530 Subject: libbpf: Switch to void * casting in netlink helpers Netlink helpers I added in 8bbb77b7c7a2 ("libbpf: Add various netlink helpers") used char * casts everywhere, and there were a few more that existed from before. Convert all of them to void * cast, as it is treated equivalently by clang/gcc for the purposes of pointer arithmetic and to follow the convention elsewhere in the kernel/libbpf. 
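(Side note, illustrative code only: gcc and clang implement the GNU extension that treats arithmetic on void * as byte-granular, so the two hypothetical helpers below compute the same address.)

    /* traditional form: cast to char * before adding the byte offset */
    static inline void *skip_bytes_char(void *p, unsigned int n)
    {
            return (char *)p + n;
    }

    /* equivalent under the GNU void-pointer-arithmetic extension */
    static inline void *skip_bytes_void(void *p, unsigned int n)
    {
            return p + n;
    }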
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210619041454.417577-2-memxor@gmail.com --- tools/lib/bpf/netlink.c | 2 +- tools/lib/bpf/nlattr.c | 2 +- tools/lib/bpf/nlattr.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index bfaa9a5c50f3..39f25e09b51e 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -524,7 +524,7 @@ static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, struct nlattr *tb[TCA_MAX + 1]; libbpf_nla_parse(tb, TCA_MAX, - (struct nlattr *)((char *)tc + NLMSG_ALIGN(sizeof(*tc))), + (struct nlattr *)((void *)tc + NLMSG_ALIGN(sizeof(*tc))), NLMSG_PAYLOAD(nh, sizeof(*tc)), NULL); if (!tb[TCA_KIND]) return NL_CONT; diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index b607fa9852b1..f57e77a6e40f 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -27,7 +27,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; - return (struct nlattr *) ((char *) nla + totlen); + return (struct nlattr *)((void *)nla + totlen); } static int nla_ok(const struct nlattr *nla, int remaining) diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 76cbfeb21955..4d15ae2ff812 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -81,7 +81,7 @@ struct libbpf_nla_req { */ static inline void *libbpf_nla_data(const struct nlattr *nla) { - return (char *) nla + NLA_HDRLEN; + return (void *)nla + NLA_HDRLEN; } static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) @@ -118,12 +118,12 @@ int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); static inline struct nlattr *nla_data(struct nlattr *nla) { - return (struct nlattr *)((char *)nla + NLA_HDRLEN); + return (struct nlattr *)((void *)nla + NLA_HDRLEN); } static inline struct nlattr *req_tail(struct libbpf_nla_req *req) { - return (struct nlattr *)((char *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); + return (struct nlattr *)((void *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); } static inline int nlattr_add(struct libbpf_nla_req *req, int type, @@ -158,7 +158,7 @@ static inline struct nlattr *nlattr_begin_nested(struct libbpf_nla_req *req, int static inline void nlattr_end_nested(struct libbpf_nla_req *req, struct nlattr *tail) { - tail->nla_len = (char *)req_tail(req) - (char *)tail; + tail->nla_len = (void *)req_tail(req) - (void *)tail; } #endif /* __LIBBPF_NLATTR_H */ -- cgit v1.2.3-59-g8ed1b From 4b9718b5a201eddcd00d9db6c36b18840125c7ee Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 22 Jun 2021 20:56:47 +0200 Subject: docs, af_xdp: Consistent indentation in examples Examples in this document use all kinds of indentation from 3 to 5 spaces and even mixed with tabs. Making them all even and equal to 4 spaces. 
Signed-off-by: Ilya Maximets Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20210622185647.3705104-1-i.maximets@ovn.org --- Documentation/networking/af_xdp.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 2ccc5644cc98..42576880aa4a 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -290,19 +290,19 @@ round-robin example of distributing packets is shown below: #define MAX_SOCKS 16 struct { - __uint(type, BPF_MAP_TYPE_XSKMAP); - __uint(max_entries, MAX_SOCKS); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(max_entries, MAX_SOCKS); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); } xsks_map SEC(".maps"); static unsigned int rr; SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) { - rr = (rr + 1) & (MAX_SOCKS - 1); + rr = (rr + 1) & (MAX_SOCKS - 1); - return bpf_redirect_map(&xsks_map, rr, XDP_DROP); + return bpf_redirect_map(&xsks_map, rr, XDP_DROP); } Note, that since there is only a single set of FILL and COMPLETION @@ -379,7 +379,7 @@ would look like this for the TX path: .. code-block:: c if (xsk_ring_prod__needs_wakeup(&my_tx_ring)) - sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0); + sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0); I.e., only use the syscall if the flag is set. @@ -442,9 +442,9 @@ purposes. The supported statistics are shown below: .. code-block:: c struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ - __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ - __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ }; XDP_OPTIONS getsockopt @@ -483,15 +483,15 @@ like this: .. code-block:: c // struct xdp_rxtx_ring { - // __u32 *producer; - // __u32 *consumer; - // struct xdp_desc *desc; + // __u32 *producer; + // __u32 *consumer; + // struct xdp_desc *desc; // }; // struct xdp_umem_ring { - // __u32 *producer; - // __u32 *consumer; - // __u64 *desc; + // __u32 *producer; + // __u32 *consumer; + // __u64 *desc; // }; // typedef struct xdp_rxtx_ring RING; -- cgit v1.2.3-59-g8ed1b From ced50fc49f3bde2892c3d7fad7b3b6bfbc6ef90e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Jun 2021 13:25:04 +0200 Subject: bpf, x86: Remove unused cnt increase from EMIT macro Removing unused cnt increase from EMIT macro together with cnt declarations. This was introduced in commit [1] to ensure proper code generation. But that code was removed in commit [2] and this extra code was left in. 
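(Simplified illustration, not the actual JIT helper: emit_code() below is a memcpy-based stand-in assuming a little-endian host, and the macro names are hypothetical. It only shows the shape of what is being dropped: the macro used to advance the output pointer and also bump a per-function byte counter that no caller reads any more.)

    #include <string.h>

    static unsigned char *emit_code(unsigned char *ptr, unsigned int bytes,
                                    unsigned int len)
    {
            memcpy(ptr, &bytes, len);       /* little-endian byte order assumed */
            return ptr + len;
    }

    /* before: every expansion also maintained a local 'cnt' that nothing read */
    #define EMIT_OLD(bytes, len) \
            do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)

    /* after: only the output pointer advances */
    #define EMIT_NEW(bytes, len) \
            do { prog = emit_code(prog, bytes, len); } while (0)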
[1] b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") [2] ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210623112504.709856-1-jolsa@kernel.org --- arch/x86/net/bpf_jit_comp.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2a2e290fa5d8..db1e83813db5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -31,7 +31,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) } #define EMIT(bytes, len) \ - do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) + do { prog = emit_code(prog, bytes, len); } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -239,7 +239,6 @@ struct jit_context { static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[0]) EMIT1(0x53); /* push rbx */ @@ -255,7 +254,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[3]) EMIT2(0x41, 0x5F); /* pop r15 */ @@ -277,13 +275,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; - int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, x86_nops[5], cnt); - prog += cnt; + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) EMIT2(0x31, 0xC0); /* xor eax, eax */ @@ -303,7 +300,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + X86_PATCH_SIZE); @@ -423,7 +419,6 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, int off1 = 42; int off2 = 31; int off3 = 9; - int cnt = 0; /* count the additional bytes used for popping callee regs from stack * that need to be taken into account for each of the offsets that @@ -513,7 +508,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int pop_bytes = 0; int off1 = 20; int poke_off; - int cnt = 0; /* count the additional bytes used for popping callee regs to stack * that need to be taken into account for jump offset that is used for @@ -615,7 +609,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, { u8 *prog = *pprog; u8 b1, b2, b3; - int cnt = 0; /* * Optimization: if imm32 is positive, use 'mov %eax, imm32' @@ -655,7 +648,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { u8 *prog = *pprog; - int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { /* @@ -678,7 +670,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) { u8 *prog = *pprog; - int cnt = 0; if (is64) { /* mov dst, src */ @@ -697,7 +688,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) { u8 *prog = *pprog; - int cnt = 0; if (is_imm8(off)) { /* 1-byte 
signed displacement. @@ -720,7 +710,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) { u8 *prog = *pprog; - int cnt = 0; if (is64) EMIT1(add_2mod(0x48, dst_reg, src_reg)); @@ -733,7 +722,6 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -764,7 +752,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -799,7 +786,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; - int cnt = 0; EMIT1(0xF0); /* lock prefix */ @@ -869,10 +855,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } -static int emit_nops(u8 **pprog, int len) +static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; - int i, noplen, cnt = 0; + int i, noplen; while (len > 0) { noplen = len; @@ -886,8 +872,6 @@ static int emit_nops(u8 **pprog, int len) } *pprog = prog; - - return cnt; } #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) @@ -902,7 +886,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0, excnt = 0; + int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; @@ -1576,7 +1560,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1622,7 +1606,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } break; } @@ -1647,7 +1631,7 @@ emit_jmp: nops); return -EFAULT; } - cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + emit_nops(&prog, INSN_SZ_DIFF - 2); } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1754,7 +1738,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, { u8 *prog = *pprog; u8 *jmp_insn; - int cnt = 0; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1822,7 +1805,6 @@ static void emit_align(u8 **pprog, u32 align) static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + 2 + 4); @@ -1854,7 +1836,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, u8 **branches) { u8 *prog = *pprog; - int i, cnt = 0; + int i; /* The first fmod_ret program will receive a garbage return value. * Set this to 0 to avoid confusing the program. 
@@ -1950,7 +1932,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *tprogs, void *orig_call) { - int ret, i, cnt = 0, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; @@ -2095,8 +2077,6 @@ static int emit_fallback_jump(u8 **pprog) */ err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); #else - int cnt = 0; - EMIT2(0xFF, 0xE2); /* jmp rdx */ #endif *pprog = prog; @@ -2106,7 +2086,7 @@ static int emit_fallback_jump(u8 **pprog) static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; - int pivot, err, jg_bytes = 1, cnt = 0; + int pivot, err, jg_bytes = 1; s64 jg_offset; if (a == b) { -- cgit v1.2.3-59-g8ed1b From 647d446d66e493d23ca1047fa8492b0269674530 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 23 Jun 2021 22:37:54 +0100 Subject: media, bpf: Do not copy more entries than user space requested The syscall bpf(BPF_PROG_QUERY, &attr) should use the prog_cnt field to see how many entries user space provided and return ENOSPC if there are more programs than that. Before this patch, this is not checked and ENOSPC is never returned. Note that one lirc device is limited to 64 bpf programs, and user space I'm aware of -- ir-keytable -- always gives enough space for 64 entries already. However, we should not copy program ids than are requested. Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210623213754.632-1-sean@mess.org --- drivers/media/rc/bpf-lirc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 3fe3edd80876..afae0afe3f81 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -326,7 +326,8 @@ int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) } if (attr->query.prog_cnt != 0 && prog_ids && cnt) - ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt); + ret = bpf_prog_array_copy_to_user(progs, prog_ids, + attr->query.prog_cnt); unlock: mutex_unlock(&ir_raw_handler_lock); -- cgit v1.2.3-59-g8ed1b From ba47396e1c042619f1c038ad19493aef737677f5 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 16 Jun 2021 17:09:50 -0700 Subject: Revert "bpf: Check for BPF_F_ADJ_ROOM_FIXED_GSO when bpf_skb_change_proto" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e. See the followup commit for the reasoning why I believe the appropriate approach is to simply make this change without a flag, but it can basically be summarized as using this helper without the flag is bug-prone or outright buggy, and thus the default should be this new behaviour. As this commit has only made it into net-next/master, but not into any real release, such a backwards incompatible change is still ok. 
Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210617000953.2787453-1-zenczykowski@gmail.com --- net/core/filter.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 0b13d8157a8f..243abf519efd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3235,7 +3235,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) return ret; } -static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3264,9 +3264,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) } /* Due to IPv6 header, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3278,7 +3276,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3307,9 +3305,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) } /* Due to IPv4 header, MSS can be upgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_increase_gso_size(shinfo, len_diff); - + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3321,17 +3317,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags) +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) - return bpf_skb_proto_4_to_6(skb, flags); + return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) - return bpf_skb_proto_6_to_4(skb, flags); + return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } @@ -3341,7 +3337,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, { int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO))) + if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork @@ -3361,7 +3357,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ - ret = bpf_skb_proto_xlat(skb, proto, flags); + ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } -- cgit v1.2.3-59-g8ed1b From 364745fbe981a4370f50274475da4675661104df Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 16 Jun 2021 17:09:51 -0700 Subject: bpf: Do not change gso_size during bpf_skb_change_proto() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is technically a backwards incompatible change in behaviour, but I'm going to argue that it is very unlikely to break things, and likely to fix *far* more then it breaks. 
In no particular order, various reasons follow: (a) I've long had a bug assigned to myself to debug a super rare kernel crash on Android Pixel phones which can (per stacktrace) be traced back to BPF clat IPv6 to IPv4 protocol conversion causing some sort of ugly failure much later on during transmit deep in the GSO engine, AFAICT precisely because of this change to gso_size, though I've never been able to manually reproduce it. I believe it may be related to the particular network offload support of attached USB ethernet dongle being used for tethering off of an IPv6-only cellular connection. The reason might be we end up with more segments than max permitted, or with a GSO packet with only one segment... (either way we break some assumption and hit a BUG_ON) (b) There is no check that the gso_size is > 20 when reducing it by 20, so we might end up with a negative (or underflowing) gso_size or a gso_size of 0. This can't possibly be good. Indeed this is probably somehow exploitable (or at least can result in a kernel crash) by delivering crafted packets and perhaps triggering an infinite loop or a divide by zero... As a reminder: gso_size (MSS) is related to MTU, but not directly derived from it: gso_size/MSS may be significantly smaller then one would get by deriving from local MTU. And on some NICs (which do loose MTU checking on receive, it may even potentially be larger, for example my work pc with 1500 MTU can receive 1520 byte frames [and sometimes does due to bugs in a vendor plat46 implementation]). Indeed even just going from 21 to 1 is potentially problematic because it increases the number of segments by a factor of 21 (think DoS, or some other crash due to too many segments). (c) It's always safe to not increase the gso_size, because it doesn't result in the max packet size increasing. So the skb_increase_gso_size() call was always unnecessary for correctness (and outright undesirable, see later). As such the only part which is potentially dangerous (ie. could cause backwards compatibility issues) is the removal of the skb_decrease_gso_size() call. (d) If the packets are ultimately destined to the local device, then there is absolutely no benefit to playing around with gso_size. It only matters if the packets will egress the device. ie. we're either forwarding, or transmitting from the device. (e) This logic only triggers for packets which are GSO. It does not trigger for skbs which are not GSO. It will not convert a non-GSO MTU sized packet into a GSO packet (and you don't even know what the MTU is, so you can't even fix it). As such your transmit path must *already* be able to handle an MTU 20 bytes larger then your receive path (for IPv4 to IPv6 translation) - and indeed 28 bytes larger due to IPv4 fragments. Thus removing the skb_decrease_gso_size() call doesn't actually increase the size of the packets your transmit side must be able to handle. ie. to handle non-GSO max-MTU packets, the IPv4/IPv6 device/ route MTUs must already be set correctly. Since for example with an IPv4 egress MTU of 1500, IPv4 to IPv6 translation will already build 1520 byte IPv6 frames, so you need a 1520 byte device MTU. This means if your IPv6 device's egress MTU is 1280, your IPv4 route must be 1260 (and actually 1252, because of the need to handle fragments). This is to handle normal non-GSO packets. Thus the reduction is simply not needed for GSO packets, because when they're correctly built, they will already be the right size. 

(f) TSO/GSO should be able to exactly undo GRO: the number of packets (TCP segments) should not be modified, so that TCP's MSS counting works correctly (this matters for congestion control). If protocol conversion changes the gso_size, then the number of TCP segments may increase or decrease. Packet loss after protocol conversion can result in partial loss of MSS segments that the sender sent. How's the sending TCP stack going to react to receiving ACKs/SACKs in the middle of the segments it sent?

(g) skb_{decrease,increase}_gso_size() are already no-ops for the GSO_BY_FRAGS case (besides triggering WARN_ON_ONCE). This means you already cannot guarantee that gso_size (and thus the resulting packet MTU) is changed, i.e. you must assume it won't be changed.

(h) Changing gso_size is outright buggy for UDP GSO packets, where framing matters (I believe that's also the case for SCTP, but it's already excluded by [g]). So the only remaining case is TCP, which also doesn't want it (see [f]).

(i) See also the reasoning on the previous attempt at fixing this (commit fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e), which shows that the current behaviour causes TCP packet loss: in the forwarding path GRO -> BPF 6 to 4 -> GSO for TCP traffic, the coalesced packet payload can be > MSS, but < MSS + 20. bpf_skb_proto_6_to_4() will upgrade the MSS, and it can become larger than the payload length. tcp_gso_segment() then checks whether the payload length is <= MSS, and that condition causes the packet to be dropped:

tcp_gso_segment():
        [...]
        mss = skb_shinfo(skb)->gso_size;
        if (unlikely(skb->len <= mss))
                goto out;
        [...]

Thus changing the gso_size is simply a very bad idea. Increasing is unnecessary and buggy, and decreasing can go negative.

Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Signed-off-by: Maciej Żenczykowski
Signed-off-by: Daniel Borkmann
Cc: Dongseok Yi
Cc: Willem de Bruijn
Link: https://lore.kernel.org/bpf/CANP3RGfjLikQ6dg=YpBU0OeHvyv7JOki7CyOUS9modaXAi-9vQ@mail.gmail.com
Link: https://lore.kernel.org/bpf/20210617000953.2787453-2-zenczykowski@gmail.com
---
 net/core/filter.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 243abf519efd..ae92a8bada0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3263,8 +3263,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 		shinfo->gso_type |= SKB_GSO_TCPV6;
 	}
 
-	/* Due to IPv6 header, MSS needs to be downgraded. */
-	skb_decrease_gso_size(shinfo, len_diff);
 	/* Header must be checked, and gso_segs recomputed. */
 	shinfo->gso_type |= SKB_GSO_DODGY;
 	shinfo->gso_segs = 0;
@@ -3304,8 +3302,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 		shinfo->gso_type |= SKB_GSO_TCPV4;
 	}
 
-	/* Due to IPv4 header, MSS can be upgraded. */
-	skb_increase_gso_size(shinfo, len_diff);
 	/* Header must be checked, and gso_segs recomputed. */
 	shinfo->gso_type |= SKB_GSO_DODGY;
 	shinfo->gso_segs = 0;
-- cgit v1.2.3-59-g8ed1b

From 0bc919d3e0b8149a60d2444c6a8e2b5974556522 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski
Date: Wed, 16 Jun 2021 17:09:52 -0700
Subject: bpf: Support all gso types in bpf_skb_change_proto()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since we no longer modify gso_size, it is now theoretically safe to not set SKB_GSO_DODGY and reset gso_segs to zero. This also means the skb_is_gso_tcp() check should no longer be necessary.
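For illustration, this is roughly how a program drives the helper whose GSO handling these patches change. It is a minimal, sketch-only tc program; the section name, the program name and the elided IPv6 header rewrite are assumptions made for the example, not taken from the patches themselves:

  /* Sketch: ask for 4->6 translation room; the program is then responsible
   * for storing a fully formed IPv6 header (e.g. with bpf_skb_store_bytes())
   * before letting the packet continue.
   */
  #include <linux/bpf.h>
  #include <linux/if_ether.h>
  #include <linux/pkt_cls.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_endian.h>

  SEC("classifier")
  int clat_4_to_6(struct __sk_buff *skb)
  {
          /* The helper resizes the header room and flips skb->protocol
           * to ETH_P_IPV6; flags must be 0.
           */
          if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
                  return TC_ACT_SHOT;

          /* ...write the new IPv6 header at the network offset here... */
          return TC_ACT_OK;
  }

  char _license[] SEC("license") = "GPL";

The only detail that matters for this series is the helper's behaviour behind that call: with these patches applied, any non-zero flags value is rejected with -EINVAL and gso_size is no longer touched.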
Unfortunately we cannot remove the skb_{decrease,increase}_gso_size() helpers, as they are still used elsewhere: bpf_skb_net_grow() without BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_net_shrink() without BPF_F_ADJ_ROOM_FIXED_GSO net/core/lwt_bpf.c's handle_gso_type() Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210617000953.2787453-3-zenczykowski@gmail.com --- net/core/filter.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ae92a8bada0f..d062053994c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -3255,17 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV4 needs to be changed into - * SKB_GSO_TCPV6. - */ + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } - - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -3280,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -3294,17 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV6 needs to be changed into - * SKB_GSO_TCPV4. - */ + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } - - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); -- cgit v1.2.3-59-g8ed1b From b9964ce74544ea6cbc4eabd2c89a531adf7f291d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Jun 2021 18:05:51 +0200 Subject: rcu: Create an unrcu_pointer() to remove __rcu from a pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xchg() and cmpxchg() functions are sometimes used to carry out RCU updates. Unfortunately, this can result in sparse warnings for both the old-value and new-value arguments, as well as for the return value. The arguments can be dealt with using RCU_INITIALIZER(): old_p = xchg(&p, RCU_INITIALIZER(new_p)); But a sparse warning still remains due to assigning the __rcu pointer returned from xchg to the (most likely) non-__rcu pointer old_p. This commit therefore provides an unrcu_pointer() macro that strips the __rcu. This macro can be used as follows: old_p = unrcu_pointer(xchg(&p, RCU_INITIALIZER(new_p))); Reported-by: Toke Høiland-Jørgensen Signed-off-by: Paul E. 
McKenney Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-2-toke@redhat.com --- include/linux/rcupdate.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9455476c5ba2..d7895b81264e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ +/** + * unrcu_pointer - mark a pointer as not being RCU protected + * @p: pointer needing to lose its __rcu property + * + * Converts @p from an __rcu pointer to a __kernel pointer. + * This allows an __rcu pointer to be used with xchg() and friends. + */ +#define unrcu_pointer(p) \ +({ \ + typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \ + rcu_check_sparse(p, __rcu); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ +}) + #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ -- cgit v1.2.3-59-g8ed1b From 9a145c04a293933002ec288a4d6b4f370b59e4d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Jun 2021 18:05:52 +0200 Subject: doc: Clarify and expand RCU updaters and corresponding readers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit clarifies which primitives readers can use given that the corresponding updaters have made a specific choice. This commit also adds this information for the various RCU Tasks flavors. While in the area, it removes a paragraph that no longer applies in any straightforward manner. Signed-off-by: Paul E. McKenney Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-3-toke@redhat.com --- Documentation/RCU/checklist.rst | 48 +++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 1030119294d0..07f6cb8f674d 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -211,27 +211,33 @@ over a rather long period of time, but improvements are always welcome! of the system, especially to real-time workloads running on the rest of the system. -7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. - If the updater uses call_rcu() or synchronize_rcu(), - then the corresponding readers may use rcu_read_lock() and - rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), - or any pair of primitives that disables and re-enables preemption, - for example, rcu_read_lock_sched() and rcu_read_unlock_sched(). - If the updater uses synchronize_srcu() or call_srcu(), - then the corresponding readers must use srcu_read_lock() and - srcu_read_unlock(), and with the same srcu_struct. The rules for - the expedited primitives are the same as for their non-expedited - counterparts. Mixing things up will result in confusion and - broken kernels, and has even resulted in an exploitable security - issue. - - One exception to this rule: rcu_read_lock() and rcu_read_unlock() - may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() - in cases where local bottom halves are already known to be - disabled, for example, in irq or softirq context. Commenting - such cases is a must, of course! 
And the jury is still out on - whether the increased speed is worth it. +7. As of v4.20, a given kernel implements only one RCU flavor, which + is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. + If the updater uses call_rcu() or synchronize_rcu(), then + the corresponding readers may use: (1) rcu_read_lock() and + rcu_read_unlock(), (2) any pair of primitives that disables + and re-enables softirq, for example, rcu_read_lock_bh() and + rcu_read_unlock_bh(), or (3) any pair of primitives that disables + and re-enables preemption, for example, rcu_read_lock_sched() and + rcu_read_unlock_sched(). If the updater uses synchronize_srcu() + or call_srcu(), then the corresponding readers must use + srcu_read_lock() and srcu_read_unlock(), and with the same + srcu_struct. The rules for the expedited RCU grace-period-wait + primitives are the same as for their non-expedited counterparts. + + If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(), + then the readers must refrain from executing voluntary + context switches, that is, from blocking. If the updater uses + call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then + the corresponding readers must use rcu_read_lock_trace() and + rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude() + or synchronize_rcu_tasks_rude(), then the corresponding readers + must use anything that disables interrupts. + + Mixing things up will result in confusion and broken kernels, and + has even resulted in an exploitable security issue. Therefore, + when using non-obvious pairs of primitives, commenting is of + course a must. 8. Although synchronize_rcu() is slower than is call_rcu(), it usually results in simpler code. So, unless update performance is -- cgit v1.2.3-59-g8ed1b From e74c74f9e51deb725e72d129084ba8252d47222d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:53 +0200 Subject: doc: Give XDP as example of non-obvious RCU reader/updater pairing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit gives an example of non-obvious RCU reader/updater pairing in the guise of the XDP feature in networking, which calls BPF programs from network-driver NAPI (softirq) context. Signed-off-by: Paul E. McKenney Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-4-toke@redhat.com --- Documentation/RCU/checklist.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 07f6cb8f674d..01cc21f17f7b 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -236,8 +236,15 @@ over a rather long period of time, but improvements are always welcome! Mixing things up will result in confusion and broken kernels, and has even resulted in an exploitable security issue. Therefore, - when using non-obvious pairs of primitives, commenting is of - course a must. + when using non-obvious pairs of primitives, commenting is + of course a must. One example of non-obvious pairing is + the XDP feature in networking, which calls BPF programs from + network-driver NAPI (softirq) context. BPF relies heavily on RCU + protection for its data structures, but because the BPF program + invocation happens entirely within a single local_bh_disable() + section in a NAPI poll cycle, this usage is safe. 
The reason + that this usage is safe is that readers can use anything that + disables BH when updaters use call_rcu() or synchronize_rcu(). 8. Although synchronize_rcu() is slower than is call_rcu(), it usually results in simpler code. So, unless update performance is -- cgit v1.2.3-59-g8ed1b From 694cea395fded425008e93cd90cfdf7a451674af Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:54 +0200 Subject: bpf: Allow RCU-protected lookups to happen from bh context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP programs are called from a NAPI poll context, which means the RCU reference liveness is ensured by local_bh_disable(). Add rcu_read_lock_bh_held() as a condition to the RCU checks for map lookups so lockdep understands that the dereferences are safe from inside *either* an rcu_read_lock() section *or* a local_bh_disable() section. While both bh_disabled and rcu_read_lock() provide RCU protection, they are semantically distinct, so we need both conditions to prevent lockdep complaints. This change is done in preparation for removing the redundant rcu_read_lock()s from drivers. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210624160609.292325-5-toke@redhat.com --- kernel/bpf/hashtab.c | 21 ++++++++++++++------- kernel/bpf/helpers.c | 6 +++--- kernel/bpf/lpm_trie.c | 6 ++++-- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6f6681b07364..72c58cc516a3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -596,7 +596,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -989,7 +990,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1082,7 +1084,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1148,7 +1151,8 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1202,7 +1206,8 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1276,7 +1281,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && 
!rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; @@ -1311,7 +1317,8 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a2f1f15ce432..62cf00383910 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -29,7 +29,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -45,7 +45,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -62,7 +62,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b7b8a6f34ee..423549d2c52e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -232,7 +232,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... */ - for (node = rcu_dereference(trie->root); node;) { + for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + node;) { unsigned int next_bit; size_t matchlen; @@ -264,7 +265,8 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * traverse down. */ next_bit = extract_bit(key->data, node->prefixlen); - node = rcu_dereference(node->child[next_bit]); + node = rcu_dereference_check(node->child[next_bit], + rcu_read_lock_bh_held()); } if (!found) -- cgit v1.2.3-59-g8ed1b From 782347b6bcad07ddb574422e01e22c92e05928c8 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:55 +0200 Subject: xdp: Add proper __rcu annotations to redirect map entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP_REDIRECT works by a three-step process: the bpf_redirect() and bpf_redirect_map() helpers will lookup the target of the redirect and store it (along with some other metadata) in a per-CPU struct bpf_redirect_info. Next, when the program returns the XDP_REDIRECT return code, the driver will call xdp_do_redirect() which will use the information thus stored to actually enqueue the frame into a bulk queue structure (that differs slightly by map type, but shares the same principle). Finally, before exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will flush all the different bulk queues, thus completing the redirect. Pointers to the map entries will be kept around for this whole sequence of steps, protected by RCU. However, there is no top-level rcu_read_lock() in the core code; instead drivers add their own rcu_read_lock() around the XDP portions of the code, but somewhat inconsistently as Martin discovered[0]. 
However, things still work because everything happens inside a single NAPI poll sequence, which means it's between a pair of calls to local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could document this intention by using rcu_dereference_check() with rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and lockdep to verify that everything is done correctly. This patch does just that: we add an __rcu annotation to the map entry pointers and remove the various comments explaining the NAPI poll assurance strewn through devmap.c in favour of a longer explanation in filter.c. The goal is to have one coherent documentation of the entire flow, and rely on the RCU annotations as a "standard" way of communicating the flow in the map code (which can additionally be understood by sparse and lockdep). The RCU annotation replacements result in a fairly straight-forward replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE() becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the proper constructs to cast the pointer back and forth between __rcu and __kernel address space (for the benefit of sparse). The one complication is that xskmap has a few constructions where double-pointers are passed back and forth; these simply all gain __rcu annotations, and only the final reference/dereference to the inner-most pointer gets changed. With this, everything can be run through sparse without eliciting complaints, and lockdep can verify correctness even without the use of rcu_read_lock() in the drivers. Subsequent patches will clean these up from the drivers. [0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/ [1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com --- include/linux/filter.h | 8 +++----- include/net/xdp_sock.h | 2 +- kernel/bpf/cpumap.c | 13 +++++++++---- kernel/bpf/devmap.c | 49 +++++++++++++++++++++---------------------------- net/core/filter.c | 28 ++++++++++++++++++++++++++++ net/xdp/xsk.c | 4 ++-- net/xdp/xsk.h | 4 ++-- net/xdp/xskmap.c | 29 +++++++++++++++++------------ 8 files changed, 83 insertions(+), 54 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 688856e0b28a..472f97074da0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -763,11 +763,9 @@ DECLARE_BPF_DISPATCHER(xdp) static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { - /* Caller needs to hold rcu_read_lock() (!), otherwise program - * can be released while still running, or map elements could be - * freed early while still having concurrent users. XDP fastpath - * already takes rcu_read_lock() when fetching the program, so - * it's not necessary here anymore. + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. 
*/ return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); } diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9c0722c6d7ac..fff069d2ed1b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -37,7 +37,7 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; + struct xdp_sock __rcu *xsk_map[]; }; struct xdp_sock { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a1a0c4e791c6..480e936c54d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -74,7 +74,7 @@ struct bpf_cpu_map_entry { struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ - struct bpf_cpu_map_entry **cpu_map; + struct bpf_cpu_map_entry __rcu **cpu_map; }; static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); @@ -469,7 +469,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, { struct bpf_cpu_map_entry *old_rcpu; - old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); @@ -551,7 +551,7 @@ static void cpu_map_free(struct bpf_map *map) for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; - rcpu = READ_ONCE(cmap->cpu_map[i]); + rcpu = rcu_dereference_raw(cmap->cpu_map[i]); if (!rcpu) continue; @@ -562,6 +562,10 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); @@ -570,7 +574,8 @@ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - rcpu = READ_ONCE(cmap->cpu_map[key]); + rcpu = rcu_dereference_check(cmap->cpu_map[key], + rcu_read_lock_bh_held()); return rcpu; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2a75e6c2d27d..2f6bd75cd682 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -73,7 +73,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ + struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -226,7 +226,7 @@ static void dev_map_free(struct bpf_map *map) for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; - dev = dtab->netdev_map[i]; + dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; @@ -259,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -410,15 +414,9 @@ out: trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } -/* __dev_flush is called from xdp_do_flush() which _must_ be signaled - * from the driver before returning from its napi->poll() routine. 
The poll() - * routine is called either from busy_poll context or net_rx_action signaled - * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the flush list - * is empty before completing to ensure all flush operations have completed. - * When drivers update the bpf program they may need to ensure any flush ops - * are also complete. Using synchronize_rcu or call_rcu will suffice for this - * because both wait for napi context to exit. +/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the + * driver before returning from its napi->poll() routine. See the comment above + * xdp_do_flush() in filter.c. */ void __dev_flush(void) { @@ -433,9 +431,9 @@ void __dev_flush(void) } } -/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or - * update happens in parallel here a dev_put won't happen until after reading - * the ifindex. +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -445,12 +443,14 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - obj = READ_ONCE(dtab->netdev_map[key]); + obj = rcu_dereference_check(dtab->netdev_map[key], + rcu_read_lock_bh_held()); return obj; } -/* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. +/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu + * variable access, and map elements stick around. See comment above + * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) @@ -735,14 +735,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - /* Use call_rcu() here to ensure any rcu critical sections have - * completed as well as any flush operations because call_rcu - * will wait for preempt-disable region to complete, NAPI in this - * context. And additionally, the driver tear down ensures all - * soft irqs are complete before removing the net device in the - * case of dev_put equals zero. - */ - old_dev = xchg(&dtab->netdev_map[k], NULL); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; @@ -851,7 +844,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, * Remembering the driver side flush operation will happen before the * net device is removed. 
*/ - old_dev = xchg(&dtab->netdev_map[i], dev); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); @@ -1031,10 +1024,10 @@ static int dev_map_notification(struct notifier_block *notifier, for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; - dev = READ_ONCE(dtab->netdev_map[i]); + dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; - odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); diff --git a/net/core/filter.c b/net/core/filter.c index d062053994c7..d22895caa164 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3897,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; +/* XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), + * which will flush all the different bulk queues, thus completing the + * redirect. + * + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that to hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ void xdp_do_flush(void) { __dev_flush(); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd62d4ba87a9..996da915f520 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs) } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, - struct xdp_sock ***map_entry) + struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; @@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 
*/ - struct xdp_sock **map_entry = NULL; + struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index edcf249ad1f1..a4bc4749faac 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 { struct xsk_map_node { struct list_head node; struct xsk_map *map; - struct xdp_sock **map_entry; + struct xdp_sock __rcu **map_entry; }; static inline struct xdp_sock *xdp_sk(struct sock *sk) @@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); + struct xdp_sock __rcu **map_entry); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 9df75ea4a567..2e48d0e094d9 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -12,7 +12,7 @@ #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; @@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) } static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; @@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. 
+ */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); @@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - return READ_ONCE(m->xsk_map[key]); + return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - WARN_ON_ONCE(!rcu_read_lock_held()); return __xsk_map_lookup_elem(map, *(u32 *)key); } @@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; @@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); + old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; @@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, goto out; } xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); + rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -208,7 +212,8 @@ out: static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *old_xs; int k = *(u32 *)key; if (k >= map->max_entries) @@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); + old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); + if (rcu_access_pointer(*map_entry) == xs) { + rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); -- cgit v1.2.3-59-g8ed1b From 77151ccf10659d4066074f278402032f3265f0cc Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:56 +0200 Subject: bpf, sched: Remove unneeded rcu_read_lock() around BPF program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rcu_read_lock() call in cls_bpf and act_bpf are redundant: on the TX side, there's already a call to rcu_read_lock_bh() in __dev_queue_xmit(), and on RX there's a covering rcu_read_lock() in netif_receive_skb{,_list}_internal(). With the previous patches we also amended the lockdep checks in the map code to not require any particular RCU flavour, so we can just get rid of the rcu_read_lock()s. 
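For reference, the relaxed check that the map code now performs looks roughly like the following. This is a minimal sketch with hypothetical names (slots, slot_lookup), not an excerpt from any particular map implementation:

  #include <linux/rcupdate.h>
  #include <linux/types.h>

  struct entry;

  /* Hypothetical array of RCU-protected slots. */
  extern struct entry __rcu **slots;

  static struct entry *slot_lookup(u32 key, u32 max_entries)
  {
          if (key >= max_entries)
                  return NULL;

          /* Legal both under rcu_read_lock() and under local_bh_disable()
           * (NAPI/softirq): the rcu_read_lock_bh_held() condition makes
           * lockdep accept either.
           */
          return rcu_dereference_check(slots[key], rcu_read_lock_bh_held());
  }

Because the bh-disabled case is encoded in the check itself, callers running in softirq context no longer need an extra rcu_read_lock() purely to keep lockdep quiet, which is what allows the removals below.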
Suggested-by: Daniel Borkmann Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-7-toke@redhat.com --- net/sched/act_bpf.c | 2 -- net/sched/cls_bpf.c | 3 --- 2 files changed, 5 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e48e980c3b93..e409a0005717 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -43,7 +43,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - rcu_read_lock(); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); @@ -56,7 +55,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); - rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6e3e63db0e01..fa739efa59f4 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - /* Needed here for accessing maps. */ - rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; @@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, break; } - rcu_read_unlock(); return ret; } -- cgit v1.2.3-59-g8ed1b From 0939e0537896e421e391fa4b1a0b052907808e0d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:57 +0200 Subject: ena: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ena driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Saeed Bishara Cc: Guy Tzalik Link: https://lore.kernel.org/bpf/20210624160609.292325-8-toke@redhat.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 3bb0e66b2c7e..44ef6b88f715 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -382,7 +382,6 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) struct xdp_frame *xdpf; u64 *xdp_stat; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_bpf_prog); if (!xdp_prog) @@ -439,8 +438,6 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); out: - rcu_read_unlock(); - return verdict; } -- cgit v1.2.3-59-g8ed1b From 158c1399fc45c5178a3f2b8b68ff2faa2e36a52d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:58 +0200 Subject: bnxt: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bnxt driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Michael Chan Link: https://lore.kernel.org/bpf/20210624160609.292325-9-toke@redhat.com --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index ec9564e584e0..bee6e091a997 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -138,9 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp_prepare_buff(&xdp, *data_ptr - offset, offset, *len, false); orig_data = xdp.data; - rcu_read_lock(); act = bpf_prog_run_xdp(xdp_prog, &xdp); - rcu_read_unlock(); tx_avail = bnxt_tx_avail(bp, txr); /* If the tx ring is not full, we must not update the rx producer yet -- cgit v1.2.3-59-g8ed1b From 36baafe347a85a9d85f61aac0a9b53c53635829e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:59 +0200 Subject: thunderx: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The thunderx driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. 
This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Sunil Goutham Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/bpf/20210624160609.292325-10-toke@redhat.com --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index c33b4e837515..e2b290135fd9 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -555,9 +555,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false); orig_data = xdp.data; - rcu_read_lock(); action = bpf_prog_run_xdp(prog, &xdp); - rcu_read_unlock(); len = xdp.data_end - xdp.data; /* Check if XDP program has changed headers */ -- cgit v1.2.3-59-g8ed1b From 547aabcac3251c40e4cd09d79dba70f7eab8cca2 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:00 +0200 Subject: freescale: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dpaa and dpaa2 drivers have rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Camelia Groza Cc: Ioana Radulescu Cc: Madalin Bucur Cc: Ioana Ciornei Link: https://lore.kernel.org/bpf/20210624160609.292325-11-toke@redhat.com --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 8 +------- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 3 --- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 177c020bf34a..e6826561cf11 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2558,13 +2558,9 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, u32 xdp_act; int err; - rcu_read_lock(); - xdp_prog = READ_ONCE(priv->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + if (!xdp_prog) return XDP_PASS; - } xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, &dpaa_fq->xdp_rxq); @@ -2638,8 +2634,6 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, break; } - rcu_read_unlock(); - return xdp_act; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 8433aa730c42..973352393bd4 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -352,8 +352,6 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, u32 xdp_act = XDP_PASS; int err, offset; - rcu_read_lock(); - xdp_prog = READ_ONCE(ch->xdp.prog); if (!xdp_prog) goto out; @@ -414,7 +412,6 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, ch->xdp.res |= xdp_act; out: - rcu_read_unlock(); return xdp_act; } -- cgit v1.2.3-59-g8ed1b From 49589b23d5a92dff4a7cb705608dff7dd13ef709 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:01 +0200 Subject: intel: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel drivers all have rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Tested-by: Jesper Dangaard Brouer # i40e Cc: Jesse Brandeburg Cc: Tony Nguyen Cc: intel-wired-lan@lists.osuosl.org Link: https://lore.kernel.org/bpf/20210624160609.292325-12-toke@redhat.com --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 -- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 --- drivers/net/ethernet/intel/ice/ice_txrx.c | 6 +----- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 --- drivers/net/ethernet/intel/igb/igb_main.c | 2 -- drivers/net/ethernet/intel/igc/igc_main.c | 7 ++----- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 -- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 3 --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 -- 9 files changed, 3 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b883ab809df3..38eb8151ee9a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2298,7 +2298,6 @@ static int i40e_run_xdp(struct i40e_ring *rx_ring, struct xdp_buff *xdp) struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -2334,7 +2333,6 @@ out_failure: break; } xdp_out: - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 68f177a86403..e7e778ca074c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -153,7 +153,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); /* NB! xdp_prog will always be !NULL, due to the fact that * this path is enabled by setting an XDP program. 
*/ @@ -164,7 +163,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return I40E_XDP_REDIR; } @@ -188,7 +186,6 @@ out_failure: result = I40E_XDP_CONSUMED; break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 917eba7fdd0c..dd791ca34fab 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1135,15 +1135,11 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); #endif - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + if (!xdp_prog) goto construct_skb; - } xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); - rcu_read_unlock(); if (!xdp_res) goto construct_skb; if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 239b9bf10794..8a093368f631 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -466,7 +466,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) struct ice_ring *xdp_ring; u32 act; - rcu_read_lock(); /* ZC patch is enabled only when XDP program is set, * so here it can not be NULL */ @@ -478,7 +477,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return ICE_XDP_REDIR; } @@ -503,7 +501,6 @@ out_failure: break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 5db303d64d14..7e6435dc7e80 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8381,7 +8381,6 @@ static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter, struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -8416,7 +8415,6 @@ out_failure: break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 3f6b6d4543a8..95323095094d 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2240,18 +2240,15 @@ static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter, struct bpf_prog *prog; int res; - rcu_read_lock(); - prog = READ_ONCE(adapter->xdp_prog); if (!prog) { res = IGC_XDP_PASS; - goto unlock; + goto out; } res = __igc_xdp_run_prog(adapter, prog, xdp); -unlock: - rcu_read_unlock(); +out: return ERR_PTR(-res); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2ac5b82676f3..ffff69efd78a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2199,7 +2199,6 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -2237,7 +2236,6 @@ out_failure: break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index f72d2978263b..96dd1a4f956a 
100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -100,7 +100,6 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); @@ -108,7 +107,6 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (err) goto out_failure; - rcu_read_unlock(); return IXGBE_XDP_REDIR; } @@ -134,7 +132,6 @@ out_failure: result = IXGBE_XDP_CONSUMED; break; } - rcu_read_unlock(); return result; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index dc56931fc1dc..c714e1ecd308 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1054,7 +1054,6 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, struct bpf_prog *xdp_prog; u32 act; - rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (!xdp_prog) @@ -1082,7 +1081,6 @@ out_failure: break; } xdp_out: - rcu_read_unlock(); return ERR_PTR(-result); } -- cgit v1.2.3-59-g8ed1b From 959ad7ec066d9a61557ad6aedf77ea9b54c82df0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:02 +0200 Subject: marvell: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mvneta and mvpp2 drivers have rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Thomas Petazzoni Cc: Russell King Cc: Marcin Wojtas Link: https://lore.kernel.org/bpf/20210624160609.292325-13-toke@redhat.com --- drivers/net/ethernet/marvell/mvneta.c | 2 -- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 ---- 2 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index c15ce06427d0..ada4e26a5492 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2373,7 +2373,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi, /* Get number of received packets */ rx_todo = mvneta_rxq_busy_desc_num_get(pp, rxq); - rcu_read_lock(); xdp_prog = READ_ONCE(pp->xdp_prog); /* Fairness NAPI loop */ @@ -2451,7 +2450,6 @@ next: xdp_buf.data_hard_start = NULL; sinfo.nr_frags = 0; } - rcu_read_unlock(); if (xdp_buf.data_hard_start) mvneta_xdp_put_buff(pp, rxq, &xdp_buf, &sinfo, -1); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 9bca8c8f9f8d..c31677527a02 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3881,8 +3881,6 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, int rx_done = 0; u32 xdp_ret = 0; - rcu_read_lock(); - xdp_prog = READ_ONCE(port->xdp_prog); /* Get number of received packets and clamp the to-do */ @@ -4028,8 +4026,6 @@ err_drop_frame: mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr); } - rcu_read_unlock(); - if (xdp_ret & MVPP2_XDP_REDIR) xdp_do_flush_map(); -- cgit v1.2.3-59-g8ed1b From c4411b371c104e65efb531ebd4d8892c568e3a29 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:03 +0200 Subject: mlx4: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mlx4 driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Also switch the RCU dereferences in the driver loop itself to the _bh variants. 
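Editor's note: the switch to the _bh dereference flavour is what keeps lockdep quiet once the explicit rcu_read_lock() is gone. Plain rcu_dereference() expects rcu_read_lock() to be held, while rcu_dereference_bh() documents that the protection comes from the bh-disabled context. A minimal sketch, with an invented variable name:

#include <linux/bottom_half.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

static struct bpf_prog __rcu *example_prog;

static void example_dereference_flavours(void)
{
	struct bpf_prog *p;

	rcu_read_lock();
	p = rcu_dereference(example_prog);	/* checks rcu_read_lock() */
	rcu_read_unlock();

	local_bh_disable();
	p = rcu_dereference_bh(example_prog);	/* checks that softirqs are
						 * disabled, as they are for
						 * a whole NAPI poll cycle */
	local_bh_enable();

	(void)p;
}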
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/bpf/20210624160609.292325-14-toke@redhat.com --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index cea62b8f554c..442991d91c15 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -679,9 +679,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ring = priv->rx_ring[cq_ring]; - /* Protect accesses to: ring->xdp_prog, priv->mac_hash list */ - rcu_read_lock(); - xdp_prog = rcu_dereference(ring->xdp_prog); + xdp_prog = rcu_dereference_bh(ring->xdp_prog); xdp_init_buff(&xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq); doorbell_pending = false; @@ -744,7 +742,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud /* Drop the packet, since HW loopback-ed it */ mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX]; bucket = &priv->mac_hash[mac_hash]; - hlist_for_each_entry_rcu(entry, bucket, hlist) { + hlist_for_each_entry_rcu_bh(entry, bucket, hlist) { if (ether_addr_equal_64bits(entry->mac, ethh->h_source)) goto next; @@ -899,8 +897,6 @@ next: break; } - rcu_read_unlock(); - if (likely(polled)) { if (doorbell_pending) { priv->tx_cq[TX_XDP][cq_ring]->xdp_busy = true; -- cgit v1.2.3-59-g8ed1b From d5789621b658369b21bd13446bab8102cf75df65 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:04 +0200 Subject: nfp: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nfp driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. While this is not actually an issue for the nfp driver because it doesn't support XDP_REDIRECT (and thus doesn't call xdp_do_flush()), the rcu_read_lock() is still unneeded. And With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Simon Horman Cc: oss-drivers@netronome.com Link: https://lore.kernel.org/bpf/20210624160609.292325-15-toke@redhat.com --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index eeb30680b4dc..5dfa4799c34f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1819,7 +1819,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) struct xdp_buff xdp; int idx; - rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? 
PAGE_SIZE : dp->fl_bufsz; xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM, @@ -2036,7 +2035,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) if (!nfp_net_xdp_complete(tx_ring)) pkts_polled = budget; } - rcu_read_unlock(); return pkts_polled; } -- cgit v1.2.3-59-g8ed1b From 4415db6ca85ae57830a83290388f2b9dfa5f237f Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:05 +0200 Subject: qede: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The qede driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Ariel Elior Cc: gr-everest-linux-l2@marvell.com Link: https://lore.kernel.org/bpf/20210624160609.292325-16-toke@redhat.com --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8e150dd4f899..065e9004598e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1089,13 +1089,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp_prepare_buff(&xdp, page_address(bd->data), *data_offset, *len, false); - /* Queues always have a full reset currently, so for the time - * being until there's atomic program replace just mark read - * side for map helpers. - */ - rcu_read_lock(); act = bpf_prog_run_xdp(prog, &xdp); - rcu_read_unlock(); /* Recalculate, as XDP might have changed the headers */ *data_offset = xdp.data - xdp.data_hard_start; -- cgit v1.2.3-59-g8ed1b From 4eb14e3fc6197b7205069ed4e2b31eafa11a0697 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:06 +0200 Subject: sfc: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sfc driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Edward Cree Cc: Martin Habets Link: https://lore.kernel.org/bpf/20210624160609.292325-17-toke@redhat.com --- drivers/net/ethernet/sfc/rx.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 17b8119c48e5..606750938b89 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -260,18 +260,14 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, s16 offset; int err; - rcu_read_lock(); - xdp_prog = rcu_dereference(efx->xdp_prog); - if (!xdp_prog) { - rcu_read_unlock(); + xdp_prog = rcu_dereference_bh(efx->xdp_prog); + if (!xdp_prog) return true; - } rx_queue = efx_channel_get_rx_queue(channel); if (unlikely(channel->rx_pkt_n_frags > 1)) { /* We can't do XDP on fragmented packets - drop. */ - rcu_read_unlock(); efx_free_rx_buffers(rx_queue, rx_buf, channel->rx_pkt_n_frags); if (net_ratelimit()) @@ -296,7 +292,6 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, rx_buf->len, false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); - rcu_read_unlock(); offset = (u8 *)xdp.data - *ehp; -- cgit v1.2.3-59-g8ed1b From 7b6ee873ff20c22af355661b241defa7f6ed7582 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:07 +0200 Subject: netsec: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The netsec driver has a rcu_read_lock()/rcu_read_unlock() pair around the full RX loop, covering everything up to and including xdp_do_flush(). This is actually the correct behaviour, but because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), it is also technically redundant. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep the rcu_read_lock() around anymore, so let's just remove it. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Ilias Apalodimas Cc: Jassi Brar Link: https://lore.kernel.org/bpf/20210624160609.292325-18-toke@redhat.com --- drivers/net/ethernet/socionext/netsec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index dfc85cc68173..20d148c019d8 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -958,7 +958,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) xdp_init_buff(&xdp, PAGE_SIZE, &dring->xdp_rxq); - rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); dma_dir = page_pool_get_dma_dir(dring->page_pool); @@ -1069,8 +1068,6 @@ next: } netsec_finalize_xdp_rx(priv, xdp_act, xdp_xmit); - rcu_read_unlock(); - return done; } -- cgit v1.2.3-59-g8ed1b From 2f1e432d339c5fed435adf521cae392755721050 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:08 +0200 Subject: stmmac: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stmmac driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. 
However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Cc: Jose Abreu Link: https://lore.kernel.org/bpf/20210624160609.292325-19-toke@redhat.com --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 16820873b01d..219535ab2c0c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4651,7 +4651,6 @@ static int stmmac_xdp_xmit_back(struct stmmac_priv *priv, return res; } -/* This function assumes rcu_read_lock() is held by the caller. */ static int __stmmac_xdp_run_prog(struct stmmac_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) @@ -4693,17 +4692,14 @@ static struct sk_buff *stmmac_xdp_run_prog(struct stmmac_priv *priv, struct bpf_prog *prog; int res; - rcu_read_lock(); - prog = READ_ONCE(priv->xdp_prog); if (!prog) { res = STMMAC_XDP_PASS; - goto unlock; + goto out; } res = __stmmac_xdp_run_prog(priv, prog, xdp); -unlock: - rcu_read_unlock(); +out: return ERR_PTR(-res); } @@ -4973,10 +4969,8 @@ read_again: buf->xdp->data_end = buf->xdp->data + buf1_len; xsk_buff_dma_sync_for_cpu(buf->xdp, rx_q->xsk_pool); - rcu_read_lock(); prog = READ_ONCE(priv->xdp_prog); res = __stmmac_xdp_run_prog(priv, prog, buf->xdp); - rcu_read_unlock(); switch (res) { case STMMAC_XDP_PASS: -- cgit v1.2.3-59-g8ed1b From 0cc84b9a6003fa7f6ef5d19e7c8532a01cd41776 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:06:09 +0200 Subject: ti: Remove rcu_read_lock() around XDP program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpsw driver has rcu_read_lock()/rcu_read_unlock() pairs around XDP program invocations. However, the actual lifetime of the objects referred by the XDP program invocation is longer, all the way through to the call to xdp_do_flush(), making the scope of the rcu_read_lock() too small. This turns out to be harmless because it all happens in a single NAPI poll cycle (and thus under local_bh_disable()), but it makes the rcu_read_lock() misleading. Rather than extend the scope of the rcu_read_lock(), just get rid of it entirely. With the addition of RCU annotations to the XDP_REDIRECT map types that take bh execution into account, lockdep even understands this to be safe, so there's really no reason to keep it around. 
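Editor's note: on the map side, the "RCU annotations to the XDP_REDIRECT map types that take bh execution into account" mentioned throughout this series amount to lockdep-aware dereferences that accept either protection source. The lookup below is a sketch with an invented map structure, not the actual devmap/cpumap code:

#include <linux/rcupdate.h>
#include <linux/types.h>

struct example_map {
	void __rcu *slots[64];
};

/* Legal under an explicit rcu_read_lock() or from a bh-disabled caller
 * such as a driver's NAPI poll; with CONFIG_PROVE_RCU, lockdep verifies
 * that at least one of the two really holds.
 */
static void *example_map_lookup(struct example_map *map, u32 idx)
{
	return rcu_dereference_check(map->slots[idx],
				     rcu_read_lock_bh_held());
}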
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Tested-by: Grygorii Strashko Reviewed-by: Grygorii Strashko Cc: linux-omap@vger.kernel.org Link: https://lore.kernel.org/bpf/20210624160609.292325-20-toke@redhat.com --- drivers/net/ethernet/ti/cpsw_priv.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 5862f0a4a975..ecc2a6b7e28f 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1328,13 +1328,9 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, struct bpf_prog *prog; u32 act; - rcu_read_lock(); - prog = READ_ONCE(priv->xdp_prog); - if (!prog) { - ret = CPSW_XDP_PASS; - goto out; - } + if (!prog) + return CPSW_XDP_PASS; act = bpf_prog_run_xdp(prog, xdp); /* XDP prog might have changed packet data and boundaries */ @@ -1378,10 +1374,8 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, ndev->stats.rx_bytes += *len; ndev->stats.rx_packets++; out: - rcu_read_unlock(); return ret; drop: - rcu_read_unlock(); page_pool_recycle_direct(cpsw->page_pool[ch], page); return ret; } -- cgit v1.2.3-59-g8ed1b From a196fa78a26571359740f701cf30d774eb8a72cb Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 23 Jun 2021 12:09:18 +0800 Subject: bpfilter: Specify the log level for the kmsg message Per the kmsg document [0], if we don't specify the log level with a prefix "" in the message string, the default log level will be applied to the message. Since the default level could be warning(4), this would make the log utility such as journalctl treat the message, "Started bpfilter", as a warning. To avoid confusion, this commit adds the prefix "<5>" to make the message always a notice. [0] https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg Fixes: 36c4357c63f3 ("net: bpfilter: print umh messages to /dev/kmsg") Reported-by: Martin Loviska Signed-off-by: Gary Lin Signed-off-by: Daniel Borkmann Acked-by: Dmitrii Banshchikov Link: https://lore.kernel.org/bpf/20210623040918.8683-1-glin@suse.com --- net/bpfilter/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 05e1cfc1e5cd..291a92546246 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -57,7 +57,7 @@ int main(void) { debug_f = fopen("/dev/kmsg", "w"); setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "Started bpfilter\n"); + fprintf(debug_f, "<5>Started bpfilter\n"); loop(); fclose(debug_f); return 0; -- cgit v1.2.3-59-g8ed1b From 328aac5ecd119ede3633f7d17969b1ff34ccc784 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 22 Jun 2021 16:30:26 +0530 Subject: bpf, x86: Fix extable offset calculation Commit 4c5de127598e1 ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.") is emitting a couple of instructions before the actual load. Consider those additional instructions while calculating extable offset. 
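Editor's note: restated outside the JIT, the arithmetic in this fix looks like the helper below. It is a hypothetical restatement of the diff, not kernel code, with the variable roles as read from the surrounding do_jit() logic:

#include <linux/types.h>

/*
 * temp           - scratch buffer holding the bytes emitted for the current
 *                  BPF instruction
 * start_of_ldx   - where the potentially faulting load starts inside that
 *                  buffer, i.e. after the explicit NULL check emitted since
 *                  commit 4c5de127598e1
 * image, proglen - final JIT image and the number of bytes already committed
 *                  to it for earlier instructions
 */
static u8 *example_extable_target(u8 *image, int proglen,
				  u8 *temp, u8 *start_of_ldx)
{
	/* The old "image + proglen" pointed at the first emitted byte of
	 * the instruction, which is now the NULL check; adding
	 * (start_of_ldx - temp) skips those bytes so the extable entry
	 * references the load that can actually fault.
	 */
	return image + proglen + (start_of_ldx - temp);
}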
Fixes: 4c5de127598e1 ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.") Signed-off-by: Ravi Bangoria Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210622110026.1157847-1-ravi.bangoria@linux.ibm.com --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index db1e83813db5..e835164189f1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1281,7 +1281,7 @@ st: if (is_imm8(insn->off)) emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; - u8 *_insn = image + proglen; + u8 *_insn = image + proglen + (start_of_ldx - temp); s64 delta; /* populate jmp_offset for JMP above */ -- cgit v1.2.3-59-g8ed1b From 95b861a7935bf75f647959073093ab8058b88c26 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 27 Jun 2021 08:36:27 -0700 Subject: bpf: Allow bpf_get_current_ancestor_cgroup_id for tracing Allow the helper to be called from tracing programs. This is needed to handle cgroup hiererachies in the program. Signed-off-by: Namhyung Kim Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210627153627.824198-1-namhyung@kernel.org --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a52bc172841..64bd2d84367f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1017,6 +1017,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; -- cgit v1.2.3-59-g8ed1b From ccff81e1d028bbbf8573d3364a87542386c707bf Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Sat, 26 Jun 2021 11:11:56 -0700 Subject: bpf: Fix false positive kmemleak report in bpf_ringbuf_area_alloc() kmemleak scans struct page, but it does not scan the page content. If we allocate some memory with kmalloc(), then allocate page with alloc_page(), and if we put kmalloc pointer somewhere inside that page, kmemleak will report kmalloc pointer as a false positive. We can instruct kmemleak to scan the memory area by calling kmemleak_alloc() and kmemleak_free(), but part of struct bpf_ringbuf is mmaped to user space, and if struct bpf_ringbuf changes we would have to revisit and review size argument in kmemleak_alloc(), because we do not want kmemleak to scan the user space memory. Let's simplify things and use kmemleak_not_leak() here. For posterity, also adding additional prior analysis from Andrii: I think either kmemleak or syzbot are misreporting this. I've added a bunch of printks around all allocations performed by BPF ringbuf. [...] On repro side I get these two warnings: [vmuser@archvm bpf]$ sudo ./repro BUG: memory leak unreferenced object 0xffff88810d538c00 (size 64): comm "repro", pid 2140, jiffies 4294692933 (age 14.540s) hex dump (first 32 bytes): 00 af 19 04 00 ea ff ff c0 ae 19 04 00 ea ff ff ................ 80 ae 19 04 00 ea ff ff c0 29 2e 04 00 ea ff ff .........)...... 
backtrace: [<0000000077bfbfbd>] __bpf_map_area_alloc+0x31/0xc0 [<00000000587fa522>] ringbuf_map_alloc.cold.4+0x48/0x218 [<0000000044d49e96>] __do_sys_bpf+0x359/0x1d90 [<00000000f601d565>] do_syscall_64+0x2d/0x40 [<0000000043d3112a>] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff88810d538c80 (size 64): comm "repro", pid 2143, jiffies 4294699025 (age 8.448s) hex dump (first 32 bytes): 80 aa 19 04 00 ea ff ff 00 ab 19 04 00 ea ff ff ................ c0 ab 19 04 00 ea ff ff 80 44 28 04 00 ea ff ff .........D(..... backtrace: [<0000000077bfbfbd>] __bpf_map_area_alloc+0x31/0xc0 [<00000000587fa522>] ringbuf_map_alloc.cold.4+0x48/0x218 [<0000000044d49e96>] __do_sys_bpf+0x359/0x1d90 [<00000000f601d565>] do_syscall_64+0x2d/0x40 [<0000000043d3112a>] entry_SYSCALL_64_after_hwframe+0x44/0xae Note that both reported leaks (ffff88810d538c80 and ffff88810d538c00) correspond to pages array bpf_ringbuf is allocating and tracking properly internally. Note also that syzbot repro doesn't close FD of created BPF ringbufs, and even when ./repro itself exits with error, there are still two forked processes hanging around in my system. So clearly ringbuf maps are alive at that point. So reporting any memory leak looks weird at that point, because that memory is being used by active referenced BPF ringbuf. It's also a question why repro doesn't clean up its forks. But if I do a `pkill repro`, I do see that all the allocated memory is /properly/ cleaned up [and the] "leaks" are deallocated properly. BTW, if I add close() right after bpf() syscall in syzbot repro, I see that everything is immediately deallocated, like designed. And no memory leak is reported. So I don't think the problem is anywhere in bpf_ringbuf code, rather in the leak detection and/or repro itself. Reported-by: syzbot+5d895828587f49e7fe9b@syzkaller.appspotmail.com Signed-off-by: Rustam Kovhaev [ Daniel: also included analysis from Andrii to the commit log ] Signed-off-by: Daniel Borkmann Tested-by: syzbot+5d895828587f49e7fe9b@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzYk+dqs+jwu6VKXP-RttcTEGFe+ySTGWT9CRNkagDiJVA@mail.gmail.com Link: https://lore.kernel.org/lkml/YNTAqiE7CWJhOK2M@nuc10 Link: https://lore.kernel.org/lkml/20210615101515.GC26027@arm.com Link: https://syzkaller.appspot.com/bug?extid=5d895828587f49e7fe9b Link: https://lore.kernel.org/bpf/20210626181156.1873604-1-rkovhaev@gmail.com --- kernel/bpf/ringbuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 84b3b35fc0d0..9e0c10c6892a 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -105,6 +106,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, VM_ALLOC | VM_USERMAP, PAGE_KERNEL); if (rb) { + kmemleak_not_leak(pages); rb->pages = pages; rb->nr_pages = nr_pages; return rb; -- cgit v1.2.3-59-g8ed1b From a78cae2476812cecaa4a33d0086bbb53986906bc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 Jun 2021 15:16:12 -0700 Subject: xdp: Move the rxq_info.mem clearing to unreg_mem_model() xdp_rxq_info_unreg() implicitly calls xdp_rxq_info_unreg_mem_model(). 
This may well be confusing to the driver authors, and lead to double free if they call xdp_rxq_info_unreg_mem_model() before xdp_rxq_info_unreg() (when mem model type == MEM_TYPE_PAGE_POOL). In fact error path of mvpp2_rxq_init() seems to currently do exactly that. The double free will result in refcount underflow in page_pool_destroy(). Make the interface a little more programmer friendly by clearing type and id so that xdp_rxq_info_unreg_mem_model() can be called multiple times. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210625221612.2637086-1-kuba@kernel.org --- net/core/xdp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 725d20f1b100..cc92ccb38432 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator) void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; + int type = xdp_rxq->mem.type; int id = xdp_rxq->mem.id; + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; @@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + if (type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); @@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; - - /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); -- cgit v1.2.3-59-g8ed1b
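Editor's note: a sketch of the driver error-path ordering this last change makes safe, modelled loosely on the mvpp2_rxq_init() case the commit message mentions; everything except the xdp_rxq_info_* and MEM_TYPE_PAGE_POOL identifiers is invented:

#include <linux/errno.h>
#include <linux/netdevice.h>
#include <net/page_pool.h>
#include <net/xdp.h>

/* Stand-in for whatever driver-specific step might fail after the memory
 * model has been registered.
 */
static int example_later_step(void)
{
	return -ENOMEM;
}

static int example_rxq_init(struct xdp_rxq_info *rxq, struct page_pool *pool,
			    struct net_device *dev, u32 queue_index)
{
	int err;

	err = xdp_rxq_info_reg(rxq, dev, queue_index, 0);
	if (err)
		return err;

	err = xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
	if (err)
		goto err_unreg;

	err = example_later_step();
	if (err)
		goto err_unreg_mem;

	return 0;

err_unreg_mem:
	xdp_rxq_info_unreg_mem_model(rxq);
err_unreg:
	/* Unregisters the memory model again internally; with the mem info
	 * cleared by the first call above, this second pass is now a no-op
	 * rather than a double page_pool_destroy().
	 */
	xdp_rxq_info_unreg(rxq);
	return err;
}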