diff options
author | 2025-07-02 15:07:18 -0700 | |
---|---|---|
committer | 2025-07-02 15:07:18 -0700 | |
commit | 285c895fba9e42dfdc9eea1c3001585d9ee58233 (patch) | |
tree | 5170087b9fb795ee9716a70f5f93543f03ad7894 | |
parent | Merge branch 'vsock-test-check-for-null-ptr-deref-when-transport-changes' (diff) | |
parent | selftest: net: extend msg_zerocopy test with forwarding (diff) | |
download | wireguard-linux-285c895fba9e42dfdc9eea1c3001585d9ee58233.tar.xz wireguard-linux-285c895fba9e42dfdc9eea1c3001585d9ee58233.zip |
Merge branch 'preserve-msg_zerocopy-with-forwarding'
Willem de Bruijn says:
====================
preserve MSG_ZEROCOPY with forwarding
Avoid false positive copying of zerocopy skb frags when entering the
ingress path if the skb is not queued locally but forwarded.
Patch 1 for more details and feature.
Patch 2 converts the existing selftest to a pass/fail test and adds
coverage for this new feature.
====================
Link: https://patch.msgid.link/20250630194312.1571410-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | net/core/dev.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 6 | ||||
-rw-r--r-- | net/ipv6/ip6_input.c | 7 | ||||
-rw-r--r-- | tools/testing/selftests/net/msg_zerocopy.c | 24 | ||||
-rwxr-xr-x | tools/testing/selftests/net/msg_zerocopy.sh | 84 |
5 files changed, 90 insertions, 33 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 7ee808eb068e..96d33dead604 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5937,8 +5937,6 @@ check_vlan_id: } if (pt_prev) { - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) - goto drop; *ppt_prev = pt_prev; } else { drop: diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 30a5e9460d00..f8696e67def4 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -226,6 +226,12 @@ resubmit: static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 0b3b81fd4a58..168ec07e31cc 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -478,6 +478,13 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + skb_clear_delivery_time(skb); ip6_protocol_deliver_rcu(net, skb, 0, false); diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 7ea5fb28c93d..1d5d3c4e7e87 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -77,6 +77,7 @@ static int cfg_cork; static bool cfg_cork_mixed; static int cfg_cpu = -1; /* default: pin to last cpu */ +static int cfg_expect_zerocopy = -1; static int cfg_family = PF_UNSPEC; static int cfg_ifindex = 1; static int cfg_payload_len; @@ -92,9 +93,9 @@ static socklen_t cfg_alen; static struct sockaddr_storage cfg_dst_addr; static struct sockaddr_storage cfg_src_addr; +static int exitcode; static char payload[IP_MAXPACKET]; static long packets, bytes, completions, expected_completions; -static int zerocopied = -1; static uint32_t next_completion; static uint32_t sends_since_notify; @@ -444,11 +445,13 @@ static bool do_recv_completion(int fd, int domain) next_completion = hi + 1; zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED); - if (zerocopied == -1) - zerocopied = zerocopy; - else if (zerocopied != zerocopy) { - fprintf(stderr, "serr: inconsistent\n"); - zerocopied = zerocopy; + if (cfg_expect_zerocopy != -1 && + cfg_expect_zerocopy != zerocopy) { + fprintf(stderr, "serr: ee_code: %u != expected %u\n", + zerocopy, cfg_expect_zerocopy); + exitcode = 1; + /* suppress repeated messages */ + cfg_expect_zerocopy = zerocopy; } if (cfg_verbose >= 2) @@ -571,7 +574,7 @@ static void do_tx(int domain, int type, int protocol) fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n", packets, bytes >> 20, completions, - zerocopied == 1 ? 'y' : 'n'); + cfg_zerocopy && cfg_expect_zerocopy == 1 ? 'y' : 'n'); } static int do_setup_rx(int domain, int type, int protocol) @@ -715,7 +718,7 @@ static void parse_opts(int argc, char **argv) cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) { + while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vzZ:")) != -1) { switch (c) { case '4': if (cfg_family != PF_UNSPEC) @@ -770,6 +773,9 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_zerocopy = true; break; + case 'Z': + cfg_expect_zerocopy = !!atoi(optarg); + break; } } @@ -817,5 +823,5 @@ int main(int argc, char **argv) else error(1, 0, "unknown cfg_test %s", cfg_test); - return 0; + return exitcode; } diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh index 89c22f5320e0..28178a38a4e7 100755 --- a/tools/testing/selftests/net/msg_zerocopy.sh +++ b/tools/testing/selftests/net/msg_zerocopy.sh @@ -6,6 +6,7 @@ set -e readonly DEV="veth0" +readonly DUMMY_DEV="dummy0" readonly DEV_MTU=65535 readonly BIN="./msg_zerocopy" @@ -14,21 +15,25 @@ readonly NSPREFIX="ns-${RAND}" readonly NS1="${NSPREFIX}1" readonly NS2="${NSPREFIX}2" -readonly SADDR4='192.168.1.1' -readonly DADDR4='192.168.1.2' -readonly SADDR6='fd::1' -readonly DADDR6='fd::2' +readonly LPREFIX4='192.168.1' +readonly RPREFIX4='192.168.2' +readonly LPREFIX6='fd' +readonly RPREFIX6='fc' + readonly path_sysctl_mem="net.core.optmem_max" # No arguments: automated test if [[ "$#" -eq "0" ]]; then - $0 4 tcp -t 1 - $0 6 tcp -t 1 - $0 4 udp -t 1 - $0 6 udp -t 1 - echo "OK. All tests passed" - exit 0 + ret=0 + + $0 4 tcp -t 1 || ret=1 + $0 6 tcp -t 1 || ret=1 + $0 4 udp -t 1 || ret=1 + $0 6 udp -t 1 || ret=1 + + [[ "$ret" == "0" ]] && echo "OK. All tests passed" + exit $ret fi # Argument parsing @@ -45,11 +50,18 @@ readonly EXTRA_ARGS="$@" # Argument parsing: configure addresses if [[ "${IP}" == "4" ]]; then - readonly SADDR="${SADDR4}" - readonly DADDR="${DADDR4}" + readonly SADDR="${LPREFIX4}.1" + readonly DADDR="${LPREFIX4}.2" + readonly DUMMY_ADDR="${RPREFIX4}.1" + readonly DADDR_TXONLY="${RPREFIX4}.2" + readonly MASK="24" elif [[ "${IP}" == "6" ]]; then - readonly SADDR="${SADDR6}" - readonly DADDR="${DADDR6}" + readonly SADDR="${LPREFIX6}::1" + readonly DADDR="${LPREFIX6}::2" + readonly DUMMY_ADDR="${RPREFIX6}::1" + readonly DADDR_TXONLY="${RPREFIX6}::2" + readonly MASK="64" + readonly NODAD="nodad" else echo "Invalid IP version ${IP}" exit 1 @@ -89,33 +101,61 @@ ip netns exec "${NS2}" sysctl -w -q "${path_sysctl_mem}=1000000" ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}" +ip link add "${DUMMY_DEV}" mtu "${DEV_MTU}" netns "${NS2}" type dummy + # Bring the devices up ip -netns "${NS1}" link set "${DEV}" up ip -netns "${NS2}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DUMMY_DEV}" up # Set fixed MAC addresses on the devices ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 # Add fixed IP addresses to the devices -ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" -ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" -ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad -ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad +ip -netns "${NS1}" addr add "${SADDR}/${MASK}" dev "${DEV}" ${NODAD} +ip -netns "${NS2}" addr add "${DADDR}/${MASK}" dev "${DEV}" ${NODAD} +ip -netns "${NS2}" addr add "${DUMMY_ADDR}/${MASK}" dev "${DUMMY_DEV}" ${NODAD} + +ip -netns "${NS1}" route add default via "${DADDR}" dev "${DEV}" +ip -netns "${NS2}" route add default via "${DADDR_TXONLY}" dev "${DUMMY_DEV}" + +ip netns exec "${NS2}" sysctl -wq net.ipv4.ip_forward=1 +ip netns exec "${NS2}" sysctl -wq net.ipv6.conf.all.forwarding=1 # Optionally disable sg or csum offload to test edge cases # ip netns exec "${NS1}" ethtool -K "${DEV}" sg off +ret=0 + do_test() { local readonly ARGS="$1" - echo "ipv${IP} ${TXMODE} ${ARGS}" - ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & + # tx-rx test + # packets queued to a local socket are copied, + # sender notification has SO_EE_CODE_ZEROCOPY_COPIED. + + echo -e "\nipv${IP} ${TXMODE} ${ARGS} tx-rx\n" + ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 \ + -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & sleep 0.2 - ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 \ + -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" -Z 0 || ret=1 wait + + # next test is unconnected tx to dummy0, cannot exercise with tcp + [[ "${TXMODE}" == "tcp" ]] && return + + # tx-only test: send out dummy0 + # packets leaving the host are not copied, + # sender notification does not have SO_EE_CODE_ZEROCOPY_COPIED. + + echo -e "\nipv${IP} ${TXMODE} ${ARGS} tx-only\n" + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 \ + -S "${SADDR}" -D "${DADDR_TXONLY}" ${ARGS} "${TXMODE}" -Z 1 || ret=1 } do_test "${EXTRA_ARGS}" do_test "-z ${EXTRA_ARGS}" -echo ok + +[[ "$ret" == "0" ]] && echo "OK" |