diff options
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/hv/bondvf.sh | 255 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 1 | ||||
-rwxr-xr-x | tools/kvm/kvm_stat/kvm_stat | 22 | ||||
-rw-r--r-- | tools/lib/bpf/bpf.c | 1 | ||||
-rw-r--r-- | tools/perf/ui/browser.c | 2 | ||||
-rw-r--r-- | tools/perf/util/evsel.c | 8 | ||||
-rw-r--r-- | tools/perf/util/machine.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_maps.c | 4 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_progs.c | 8 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_verifier.c | 28 | ||||
-rw-r--r-- | tools/testing/selftests/net/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/net/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/net/msg_zerocopy.c | 697 | ||||
-rwxr-xr-x | tools/testing/selftests/net/msg_zerocopy.sh | 112 |
14 files changed, 873 insertions, 270 deletions
diff --git a/tools/hv/bondvf.sh b/tools/hv/bondvf.sh deleted file mode 100755 index 80f102860cf8..000000000000 --- a/tools/hv/bondvf.sh +++ /dev/null @@ -1,255 +0,0 @@ -#!/bin/bash - -# This example script creates bonding network devices based on synthetic NIC -# (the virtual network adapter usually provided by Hyper-V) and the matching -# VF NIC (SRIOV virtual function). So the synthetic NIC and VF NIC can -# function as one network device, and fail over to the synthetic NIC if VF is -# down. -# -# Usage: -# - After configured vSwitch and vNIC with SRIOV, start Linux virtual -# machine (VM) -# - Run this scripts on the VM. It will create configuration files in -# distro specific directory. -# - Reboot the VM, so that the bonding config are enabled. -# -# The config files are DHCP by default. You may edit them if you need to change -# to Static IP or change other settings. -# - -sysdir=/sys/class/net -netvsc_cls={f8615163-df3e-46c5-913f-f2d2f965ed0e} -bondcnt=0 - -# Detect Distro -if [ -f /etc/redhat-release ]; -then - cfgdir=/etc/sysconfig/network-scripts - distro=redhat -elif grep -q 'Ubuntu' /etc/issue -then - cfgdir=/etc/network - distro=ubuntu -elif grep -q 'SUSE' /etc/issue -then - cfgdir=/etc/sysconfig/network - distro=suse -else - echo "Unsupported Distro" - exit 1 -fi - -echo Detected Distro: $distro, or compatible - -# Get a list of ethernet names -list_eth=(`cd $sysdir && ls -d */ | cut -d/ -f1 | grep -v bond`) -eth_cnt=${#list_eth[@]} - -echo List of net devices: - -# Get the MAC addresses -for (( i=0; i < $eth_cnt; i++ )) -do - list_mac[$i]=`cat $sysdir/${list_eth[$i]}/address` - echo ${list_eth[$i]}, ${list_mac[$i]} -done - -# Find NIC with matching MAC -for (( i=0; i < $eth_cnt-1; i++ )) -do - for (( j=i+1; j < $eth_cnt; j++ )) - do - if [ "${list_mac[$i]}" = "${list_mac[$j]}" ] - then - list_match[$i]=${list_eth[$j]} - break - fi - done -done - -function create_eth_cfg_redhat { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo DEVICE=$1 >>$fn - echo TYPE=Ethernet >>$fn - echo BOOTPROTO=none >>$fn - echo UUID=`uuidgen` >>$fn - echo ONBOOT=yes >>$fn - echo PEERDNS=yes >>$fn - echo IPV6INIT=yes >>$fn - echo MASTER=$2 >>$fn - echo SLAVE=yes >>$fn -} - -function create_eth_cfg_pri_redhat { - create_eth_cfg_redhat $1 $2 -} - -function create_bond_cfg_redhat { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo DEVICE=$1 >>$fn - echo TYPE=Bond >>$fn - echo BOOTPROTO=dhcp >>$fn - echo UUID=`uuidgen` >>$fn - echo ONBOOT=yes >>$fn - echo PEERDNS=yes >>$fn - echo IPV6INIT=yes >>$fn - echo BONDING_MASTER=yes >>$fn - echo BONDING_OPTS=\"mode=active-backup miimon=100 primary=$2\" >>$fn -} - -function del_eth_cfg_ubuntu { - local mainfn=$cfgdir/interfaces - local fnlist=( $mainfn ) - - local dirlist=(`awk '/^[ \t]*source/{print $2}' $mainfn`) - - local i - for i in "${dirlist[@]}" - do - fnlist+=(`ls $i 2>/dev/null`) - done - - local tmpfl=$(mktemp) - - local nic_start='^[ \t]*(auto|iface|mapping|allow-.*)[ \t]+'$1 - local nic_end='^[ \t]*(auto|iface|mapping|allow-.*|source)' - - local fn - for fn in "${fnlist[@]}" - do - awk "/$nic_end/{x=0} x{next} /$nic_start/{x=1;next} 1" \ - $fn >$tmpfl - - cp $tmpfl $fn - done - - rm $tmpfl -} - -function create_eth_cfg_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - echo $'\n'auto $1 >>$fn - echo iface $1 inet manual >>$fn - echo bond-master $2 >>$fn -} - -function create_eth_cfg_pri_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - echo $'\n'allow-hotplug $1 >>$fn - echo iface $1 inet manual >>$fn - echo bond-master $2 >>$fn - echo bond-primary $1 >>$fn -} - -function create_bond_cfg_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - - echo $'\n'auto $1 >>$fn - echo iface $1 inet dhcp >>$fn - echo bond-mode active-backup >>$fn - echo bond-miimon 100 >>$fn - echo bond-slaves none >>$fn -} - -function create_eth_cfg_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=none >>$fn - echo STARTMODE=auto >>$fn -} - -function create_eth_cfg_pri_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=none >>$fn - echo STARTMODE=hotplug >>$fn -} - -function create_bond_cfg_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=dhcp >>$fn - echo STARTMODE=auto >>$fn - echo BONDING_MASTER=yes >>$fn - echo BONDING_SLAVE_0=$2 >>$fn - echo BONDING_SLAVE_1=$3 >>$fn - echo BONDING_MODULE_OPTS=\'mode=active-backup miimon=100 primary=$2\' >>$fn -} - -function create_bond { - local bondname=bond$bondcnt - local primary - local secondary - - local class_id1=`cat $sysdir/$1/device/class_id 2>/dev/null` - local class_id2=`cat $sysdir/$2/device/class_id 2>/dev/null` - - if [ "$class_id1" = "$netvsc_cls" ] - then - primary=$2 - secondary=$1 - elif [ "$class_id2" = "$netvsc_cls" ] - then - primary=$1 - secondary=$2 - else - return 0 - fi - - echo $'\nBond name:' $bondname - - if [ $distro == ubuntu ] - then - local mainfn=$cfgdir/interfaces - local s="^[ \t]*(auto|iface|mapping|allow-.*)[ \t]+${bondname}" - - grep -E "$s" $mainfn - if [ $? -eq 0 ] - then - echo "WARNING: ${bondname} has been configured already" - return - fi - elif [ $distro == redhat ] || [ $distro == suse ] - then - local fn=$cfgdir/ifcfg-$bondname - if [ -f $fn ] - then - echo "WARNING: ${bondname} has been configured already" - return - fi - else - echo "Unsupported Distro: ${distro}" - return - fi - - echo configuring $primary - create_eth_cfg_pri_$distro $primary $bondname - - echo configuring $secondary - create_eth_cfg_$distro $secondary $bondname - - echo creating: $bondname with primary slave: $primary - create_bond_cfg_$distro $bondname $primary $secondary -} - -for (( i=0; i < $eth_cnt-1; i++ )) -do - if [ -n "${list_match[$i]}" ] - then - create_bond ${list_eth[$i]} ${list_match[$i]} - let bondcnt=bondcnt+1 - fi -done diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ce2988be4f0e..1579cab49717 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -104,6 +104,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, }; enum bpf_prog_type { diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index dd8f00cfb8b4..32283d88701a 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -474,7 +474,7 @@ class Provider(object): @staticmethod def is_field_wanted(fields_filter, field): """Indicate whether field is valid according to fields_filter.""" - if not fields_filter: + if not fields_filter or fields_filter == "help": return True return re.match(fields_filter, field) is not None @@ -1413,8 +1413,8 @@ performance. Requirements: - Access to: - /sys/kernel/debug/kvm - /sys/kernel/debug/trace/events/* + %s + %s/events/* /proc/pid/task - /proc/sys/kernel/perf_event_paranoid < 1 if user has no CAP_SYS_ADMIN and perf events are used. @@ -1434,7 +1434,7 @@ Interactive Commands: s set update interval x toggle reporting of stats for individual child trace events Press any other key to refresh statistics immediately. -""" +""" % (PATH_DEBUGFS_KVM, PATH_DEBUGFS_TRACING) class PlainHelpFormatter(optparse.IndentedHelpFormatter): def format_description(self, description): @@ -1496,7 +1496,8 @@ Press any other key to refresh statistics immediately. action='store', default=DEFAULT_REGEX, dest='fields', - help='fields to display (regex)', + help='''fields to display (regex) + "-f help" for a list of available events''', ) optparser.add_option('-p', '--pid', action='store', @@ -1559,6 +1560,17 @@ def main(): stats = Stats(options) + if options.fields == "help": + event_list = "\n" + s = stats.get() + for key in s.keys(): + if key.find('(') != -1: + key = key[0:key.find('(')] + if event_list.find('\n' + key + '\n') == -1: + event_list += key + '\n' + sys.stdout.write(event_list) + return "" + if options.log: log(stats) elif not options.once: diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 412a7c82995a..256f571f2ab5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -314,7 +314,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) int err; bzero(&attr, sizeof(attr)); - bzero(info, *info_len); attr.info.bpf_fd = prog_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index a4d3762cd825..83874b0e266c 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -704,7 +704,7 @@ static void __ui_browser__line_arrow_down(struct ui_browser *browser, ui_browser__gotorc(browser, row, column + 1); SLsmg_draw_hline(2); - if (row++ == 0) + if (++row == 0) goto out; } else row = 0; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 87b431886670..413f74df08de 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -273,7 +273,7 @@ struct perf_evsel *perf_evsel__new_cycles(void) struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .exclude_kernel = 1, + .exclude_kernel = geteuid() != 0, }; struct perf_evsel *evsel; @@ -298,8 +298,10 @@ struct perf_evsel *perf_evsel__new_cycles(void) goto out; /* use asprintf() because free(evsel) assumes name is allocated */ - if (asprintf(&evsel->name, "cycles%.*s", - attr.precise_ip ? attr.precise_ip + 1 : 0, ":ppp") < 0) + if (asprintf(&evsel->name, "cycles%s%s%.*s", + (attr.precise_ip || attr.exclude_kernel) ? ":" : "", + attr.exclude_kernel ? "u" : "", + attr.precise_ip ? attr.precise_ip + 1 : 0, "ppp") < 0) goto error_free; out: return evsel; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 5de2b86b9880..2e9eb6aa3ce2 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2209,7 +2209,7 @@ int machine__get_kernel_start(struct machine *machine) machine->kernel_start = 1ULL << 63; if (map) { err = map__load(map); - if (map->start) + if (!err) machine->kernel_start = map->start; } return err; diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 36d6ac3f0c1c..c991ab69a720 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -440,7 +440,7 @@ static void test_arraymap_percpu_many_keys(void) static void test_devmap(int task, void *data) { - int next_key, fd; + int fd; __u32 key, value; fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), @@ -620,6 +620,8 @@ static void run_all_tests(void) test_arraymap_percpu_many_keys(); + test_devmap(0, NULL); + test_map_large(); test_map_parallel(); test_map_stress(); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 5855cd3d3d45..1f7dd35551b9 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -340,6 +340,7 @@ static void test_bpf_obj_id(void) /* Check getting prog info */ info_len = sizeof(struct bpf_prog_info) * 2; + bzero(&prog_infos[i], info_len); prog_infos[i].jited_prog_insns = ptr_to_u64(jited_insns); prog_infos[i].jited_prog_len = sizeof(jited_insns); prog_infos[i].xlated_prog_insns = ptr_to_u64(xlated_insns); @@ -369,6 +370,7 @@ static void test_bpf_obj_id(void) /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; + bzero(&map_infos[i], info_len); err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i], &info_len); if (CHECK(err || @@ -394,7 +396,7 @@ static void test_bpf_obj_id(void) nr_id_found = 0; next_id = 0; while (!bpf_prog_get_next_id(next_id, &next_id)) { - struct bpf_prog_info prog_info; + struct bpf_prog_info prog_info = {}; int prog_fd; info_len = sizeof(prog_info); @@ -418,6 +420,8 @@ static void test_bpf_obj_id(void) nr_id_found++; err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + prog_infos[i].jited_prog_insns = 0; + prog_infos[i].xlated_prog_insns = 0; CHECK(err || info_len != sizeof(struct bpf_prog_info) || memcmp(&prog_info, &prog_infos[i], info_len), "get-prog-info(next_id->fd)", @@ -436,7 +440,7 @@ static void test_bpf_obj_id(void) nr_id_found = 0; next_id = 0; while (!bpf_map_get_next_id(next_id, &next_id)) { - struct bpf_map_info map_info; + struct bpf_map_info map_info = {}; int map_fd; info_len = sizeof(map_info); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index af7d173910f4..addea82f76c9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -5980,6 +5980,34 @@ static struct bpf_test tests[] = { .result = REJECT, .result_unpriv = REJECT, }, + { + "subtraction bounds (map value)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0xff, 7), + BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, 0xff, 5), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_3), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 56), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + .result_unpriv = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index afe109e5508a..9801253e4802 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,3 +1,4 @@ +msg_zerocopy socket psock_fanout psock_tpacket diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index f6c9dbf478f8..6135a8448900 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack +TEST_GEN_FILES += reuseport_dualstack msg_zerocopy include ../lib.mk diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c new file mode 100644 index 000000000000..448c69a8af74 --- /dev/null +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -0,0 +1,697 @@ +/* Evaluate MSG_ZEROCOPY + * + * Send traffic between two processes over one of the supported + * protocols and modes: + * + * PF_INET/PF_INET6 + * - SOCK_STREAM + * - SOCK_DGRAM + * - SOCK_DGRAM with UDP_CORK + * - SOCK_RAW + * - SOCK_RAW with IP_HDRINCL + * + * PF_PACKET + * - SOCK_DGRAM + * - SOCK_RAW + * + * Start this program on two connected hosts, one in send mode and + * the other with option '-r' to put it in receiver mode. + * + * If zerocopy mode ('-z') is enabled, the sender will verify that + * the kernel queues completions on the error queue for all zerocopy + * transfers. + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <error.h> +#include <errno.h> +#include <limits.h> +#include <linux/errqueue.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <poll.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef SO_EE_ORIGIN_ZEROCOPY +#define SO_EE_ORIGIN_ZEROCOPY SO_EE_ORIGIN_UPAGE +#endif + +#ifndef SO_ZEROCOPY +#define SO_ZEROCOPY 59 +#endif + +#ifndef SO_EE_CODE_ZEROCOPY_COPIED +#define SO_EE_CODE_ZEROCOPY_COPIED 1 +#endif + +#ifndef MSG_ZEROCOPY +#define MSG_ZEROCOPY 0x4000000 +#endif + +static int cfg_cork; +static bool cfg_cork_mixed; +static int cfg_cpu = -1; /* default: pin to last cpu */ +static int cfg_family = PF_UNSPEC; +static int cfg_ifindex = 1; +static int cfg_payload_len; +static int cfg_port = 8000; +static bool cfg_rx; +static int cfg_runtime_ms = 4200; +static int cfg_verbose; +static int cfg_waittime_ms = 500; +static bool cfg_zerocopy; + +static socklen_t cfg_alen; +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; + +static char payload[IP_MAXPACKET]; +static long packets, bytes, completions, expected_completions; +static int zerocopied = -1; +static uint32_t next_completion; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static uint16_t get_ip_csum(const uint16_t *start, int num_words) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < num_words; i++) + sum += start[i]; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum; +} + +static int do_setcpu(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) + error(1, 0, "setaffinity %d", cpu); + + if (cfg_verbose) + fprintf(stderr, "cpu: %u\n", cpu); + + return 0; +} + +static void do_setsockopt(int fd, int level, int optname, int val) +{ + if (setsockopt(fd, level, optname, &val, sizeof(val))) + error(1, errno, "setsockopt %d.%d: %d", level, optname, val); +} + +static int do_poll(int fd, int events) +{ + struct pollfd pfd; + int ret; + + pfd.events = events; + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, cfg_waittime_ms); + if (ret == -1) + error(1, errno, "poll"); + + return ret && (pfd.revents & events); +} + +static int do_accept(int fd) +{ + int fda = fd; + + fd = accept(fda, NULL, NULL); + if (fd == -1) + error(1, errno, "accept"); + if (close(fda)) + error(1, errno, "close listen sock"); + + return fd; +} + +static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) +{ + int ret, len, i, flags; + + len = 0; + for (i = 0; i < msg->msg_iovlen; i++) + len += msg->msg_iov[i].iov_len; + + flags = MSG_DONTWAIT; + if (do_zerocopy) + flags |= MSG_ZEROCOPY; + + ret = sendmsg(fd, msg, flags); + if (ret == -1 && errno == EAGAIN) + return false; + if (ret == -1) + error(1, errno, "send"); + if (cfg_verbose && ret != len) + fprintf(stderr, "send: ret=%u != %u\n", ret, len); + + if (len) { + packets++; + bytes += ret; + if (do_zerocopy && ret) + expected_completions++; + } + + return true; +} + +static void do_sendmsg_corked(int fd, struct msghdr *msg) +{ + bool do_zerocopy = cfg_zerocopy; + int i, payload_len, extra_len; + + /* split up the packet. for non-multiple, make first buffer longer */ + payload_len = cfg_payload_len / cfg_cork; + extra_len = cfg_payload_len - (cfg_cork * payload_len); + + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); + + for (i = 0; i < cfg_cork; i++) { + + /* in mixed-frags mode, alternate zerocopy and copy frags + * start with non-zerocopy, to ensure attach later works + */ + if (cfg_cork_mixed) + do_zerocopy = (i & 1); + + msg->msg_iov[0].iov_len = payload_len + extra_len; + extra_len = 0; + + do_sendmsg(fd, msg, do_zerocopy); + } + + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); +} + +static int setup_iph(struct iphdr *iph, uint16_t payload_len) +{ + struct sockaddr_in *daddr = (void *) &cfg_dst_addr; + struct sockaddr_in *saddr = (void *) &cfg_src_addr; + + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->tos = 0; + iph->ihl = 5; + iph->ttl = 2; + iph->saddr = saddr->sin_addr.s_addr; + iph->daddr = daddr->sin_addr.s_addr; + iph->protocol = IPPROTO_EGP; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->check = get_ip_csum((void *) iph, iph->ihl << 1); + + return sizeof(*iph); +} + +static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len) +{ + struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr; + struct sockaddr_in6 *saddr = (void *) &cfg_src_addr; + + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = IPPROTO_EGP; + ip6h->hop_limit = 2; + ip6h->saddr = saddr->sin6_addr; + ip6h->daddr = daddr->sin6_addr; + + return sizeof(*ip6h); +} + +static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + default: + error(1, 0, "illegal domain"); + } +} + +static int do_setup_tx(int domain, int type, int protocol) +{ + int fd; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket t"); + + do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); + if (cfg_zerocopy) + do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1); + + if (domain != PF_PACKET) + if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "connect"); + + return fd; +} + +static bool do_recv_completion(int fd) +{ + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + uint32_t hi, lo, range; + int ret, zerocopy; + char control[100]; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret == -1 && errno == EAGAIN) + return false; + if (ret == -1) + error(1, errno, "recvmsg notification"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recvmsg notification: truncated"); + + cm = CMSG_FIRSTHDR(&msg); + if (!cm) + error(1, 0, "cmsg: no cmsg"); + if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) || + (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP))) + error(1, 0, "serr: wrong type: %d.%d", + cm->cmsg_level, cm->cmsg_type); + + serr = (void *) CMSG_DATA(cm); + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "serr: wrong origin: %u", serr->ee_origin); + if (serr->ee_errno != 0) + error(1, 0, "serr: wrong error code: %u", serr->ee_errno); + + hi = serr->ee_data; + lo = serr->ee_info; + range = hi - lo + 1; + + /* Detect notification gaps. These should not happen often, if at all. + * Gaps can occur due to drops, reordering and retransmissions. + */ + if (lo != next_completion) + fprintf(stderr, "gap: %u..%u does not append to %u\n", + lo, hi, next_completion); + next_completion = hi + 1; + + zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED); + if (zerocopied == -1) + zerocopied = zerocopy; + else if (zerocopied != zerocopy) { + fprintf(stderr, "serr: inconsistent\n"); + zerocopied = zerocopy; + } + + if (cfg_verbose >= 2) + fprintf(stderr, "completed: %u (h=%u l=%u)\n", + range, hi, lo); + + completions += range; + return true; +} + +/* Read all outstanding messages on the errqueue */ +static void do_recv_completions(int fd) +{ + while (do_recv_completion(fd)) {} +} + +/* Wait for all remaining completions on the errqueue */ +static void do_recv_remaining_completions(int fd) +{ + int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; + + while (completions < expected_completions && + gettimeofday_ms() < tstop) { + if (do_poll(fd, POLLERR)) + do_recv_completions(fd); + } + + if (completions < expected_completions) + error(1, 0, "missing notifications: %lu < %lu\n", + completions, expected_completions); +} + +static void do_tx(int domain, int type, int protocol) +{ + struct iovec iov[3] = { {0} }; + struct sockaddr_ll laddr; + struct msghdr msg = {0}; + struct ethhdr eth; + union { + struct ipv6hdr ip6h; + struct iphdr iph; + } nh; + uint64_t tstop; + int fd; + + fd = do_setup_tx(domain, type, protocol); + + if (domain == PF_PACKET) { + uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6; + + /* sock_raw passes ll header as data */ + if (type == SOCK_RAW) { + memset(eth.h_dest, 0x06, ETH_ALEN); + memset(eth.h_source, 0x02, ETH_ALEN); + eth.h_proto = htons(proto); + iov[0].iov_base = ð + iov[0].iov_len = sizeof(eth); + msg.msg_iovlen++; + } + + /* both sock_raw and sock_dgram expect name */ + memset(&laddr, 0, sizeof(laddr)); + laddr.sll_family = AF_PACKET; + laddr.sll_ifindex = cfg_ifindex; + laddr.sll_protocol = htons(proto); + laddr.sll_halen = ETH_ALEN; + + memset(laddr.sll_addr, 0x06, ETH_ALEN); + + msg.msg_name = &laddr; + msg.msg_namelen = sizeof(laddr); + } + + /* packet and raw sockets with hdrincl must pass network header */ + if (domain == PF_PACKET || protocol == IPPROTO_RAW) { + if (cfg_family == PF_INET) + iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len); + else + iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len); + + iov[1].iov_base = (void *) &nh; + msg.msg_iovlen++; + } + + iov[2].iov_base = payload; + iov[2].iov_len = cfg_payload_len; + msg.msg_iovlen++; + msg.msg_iov = &iov[3 - msg.msg_iovlen]; + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (cfg_cork) + do_sendmsg_corked(fd, &msg); + else + do_sendmsg(fd, &msg, cfg_zerocopy); + + while (!do_poll(fd, POLLOUT)) { + if (cfg_zerocopy) + do_recv_completions(fd); + } + + } while (gettimeofday_ms() < tstop); + + if (cfg_zerocopy) + do_recv_remaining_completions(fd); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n", + packets, bytes >> 20, completions, + zerocopied == 1 ? 'y' : 'n'); +} + +static int do_setup_rx(int domain, int type, int protocol) +{ + int fd; + + /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW, + * to recv the only copy of the packet, not a clone + */ + if (domain == PF_PACKET) + error(1, 0, "Use PF_INET/SOCK_RAW to read"); + + if (type == SOCK_RAW && protocol == IPPROTO_RAW) + error(1, 0, "IPPROTO_RAW: not supported on Rx"); + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket r"); + + do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21); + do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16); + do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1); + + if (bind(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "bind"); + + if (type == SOCK_STREAM) { + if (listen(fd, 1)) + error(1, errno, "listen"); + fd = do_accept(fd); + } + + return fd; +} + +/* Flush all outstanding bytes for the tcp receive queue */ +static void do_flush_tcp(int fd) +{ + int ret; + + /* MSG_TRUNC flushes up to len bytes */ + ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + return; + if (ret == -1) + error(1, errno, "flush"); + if (!ret) + return; + + packets++; + bytes += ret; +} + +/* Flush all outstanding datagrams. Verify first few bytes of each. */ +static void do_flush_datagram(int fd, int type) +{ + int ret, off = 0; + char buf[64]; + + /* MSG_TRUNC will return full datagram length */ + ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC); + if (ret == -1 && errno == EAGAIN) + return; + + /* raw ipv4 return with header, raw ipv6 without */ + if (cfg_family == PF_INET && type == SOCK_RAW) { + off += sizeof(struct iphdr); + ret -= sizeof(struct iphdr); + } + + if (ret == -1) + error(1, errno, "recv"); + if (ret != cfg_payload_len) + error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); + if (ret > sizeof(buf) - off) + ret = sizeof(buf) - off; + if (memcmp(buf + off, payload, ret)) + error(1, 0, "recv: data mismatch"); + + packets++; + bytes += cfg_payload_len; +} + +static void do_rx(int domain, int type, int protocol) +{ + uint64_t tstop; + int fd; + + fd = do_setup_rx(domain, type, protocol); + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (type == SOCK_STREAM) + do_flush_tcp(fd); + else + do_flush_datagram(fd, type); + + do_poll(fd, POLLIN); + + } while (gettimeofday_ms() < tstop); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20); +} + +static void do_test(int domain, int type, int protocol) +{ + int i; + + if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM)) + error(1, 0, "can only cork udp sockets"); + + do_setcpu(cfg_cpu); + + for (i = 0; i < IP_MAXPACKET; i++) + payload[i] = 'a' + (i % 26); + + if (cfg_rx) + do_rx(domain, type, protocol); + else + do_tx(domain, type, protocol); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s [options] <test>", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + const int max_payload_len = sizeof(payload) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) - + 40 /* max tcp options */; + int c; + + cfg_payload_len = max_payload_len; + + while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) { + switch (c) { + case '4': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + break; + case '6': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + break; + case 'c': + cfg_cork = strtol(optarg, NULL, 0); + break; + case 'C': + cfg_cpu = strtol(optarg, NULL, 0); + break; + case 'D': + setup_sockaddr(cfg_family, optarg, &cfg_dst_addr); + break; + case 'i': + cfg_ifindex = if_nametoindex(optarg); + if (cfg_ifindex == 0) + error(1, errno, "invalid iface: %s", optarg); + break; + case 'm': + cfg_cork_mixed = true; + break; + case 'p': + cfg_port = htons(strtoul(optarg, NULL, 0)); + break; + case 'r': + cfg_rx = true; + break; + case 's': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'S': + setup_sockaddr(cfg_family, optarg, &cfg_src_addr); + break; + case 't': + cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; + break; + case 'v': + cfg_verbose++; + break; + case 'z': + cfg_zerocopy = true; + break; + } + } + + if (cfg_payload_len > max_payload_len) + error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); + if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork)) + error(1, 0, "-m: cork_mixed requires corking and zerocopy"); + + if (optind != argc - 1) + usage(argv[0]); +} + +int main(int argc, char **argv) +{ + const char *cfg_test; + + parse_opts(argc, argv); + + cfg_test = argv[argc - 1]; + + if (!strcmp(cfg_test, "packet")) + do_test(PF_PACKET, SOCK_RAW, 0); + else if (!strcmp(cfg_test, "packet_dgram")) + do_test(PF_PACKET, SOCK_DGRAM, 0); + else if (!strcmp(cfg_test, "raw")) + do_test(cfg_family, SOCK_RAW, IPPROTO_EGP); + else if (!strcmp(cfg_test, "raw_hdrincl")) + do_test(cfg_family, SOCK_RAW, IPPROTO_RAW); + else if (!strcmp(cfg_test, "tcp")) + do_test(cfg_family, SOCK_STREAM, 0); + else if (!strcmp(cfg_test, "udp")) + do_test(cfg_family, SOCK_DGRAM, 0); + else + error(1, 0, "unknown cfg_test %s", cfg_test); + + return 0; +} diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh new file mode 100755 index 000000000000..d571d213418d --- /dev/null +++ b/tools/testing/selftests/net/msg_zerocopy.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# Send data between two processes across namespaces +# Run twice: once without and once with zerocopy + +set -e + +readonly DEV="veth0" +readonly DEV_MTU=65535 +readonly BIN="./msg_zerocopy" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +readonly path_sysctl_mem="net.core.optmem_max" + +# Argument parsing +if [[ "$#" -lt "2" ]]; then + echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] <args>" + exit 1 +fi + +readonly IP="$1" +shift +readonly TXMODE="$1" +shift +readonly EXTRA_ARGS="$@" + +# Argument parsing: configure addresses +if [[ "${IP}" == "4" ]]; then + readonly SADDR="${SADDR4}" + readonly DADDR="${DADDR4}" +elif [[ "${IP}" == "6" ]]; then + readonly SADDR="${SADDR6}" + readonly DADDR="${DADDR6}" +else + echo "Invalid IP version ${IP}" + exit 1 +fi + +# Argument parsing: select receive mode +# +# This differs from send mode for +# - packet: use raw recv, because packet receives skb clones +# - raw_hdrinc: use raw recv, because hdrincl is a tx-only option +case "${TXMODE}" in +'packet' | 'packet_dgram' | 'raw_hdrincl') + RXMODE='raw' + ;; +*) + RXMODE="${TXMODE}" + ;; +esac + +# Start of state changes: install cleanup handler +save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})" + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" + sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}" +} + +trap cleanup EXIT + +# Configure system settings +sysctl -w -q "${path_sysctl_mem}=1000000" + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ + peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +# Optionally disable sg or csum offload to test edge cases +# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off + +do_test() { + local readonly ARGS="$1" + + echo "ipv${IP} ${TXMODE} ${ARGS}" + ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & + sleep 0.2 + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" + wait +} + +do_test "${EXTRA_ARGS}" +do_test "-z ${EXTRA_ARGS}" +echo ok |