From 68e8b849b221b37a78a110a0307717d45e3593a0 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:22 +0200 Subject: net: initial AF_XDP skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buildable skeleton of AF_XDP without any functionality. Just what it takes to register a new address family. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- net/xdp/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 net/xdp/Kconfig (limited to 'net/xdp') diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig new file mode 100644 index 000000000000..90e4a7152854 --- /dev/null +++ b/net/xdp/Kconfig @@ -0,0 +1,7 @@ +config XDP_SOCKETS + bool "XDP sockets" + depends on BPF_SYSCALL + default n + help + XDP sockets allows a channel between XDP programs and + userspace applications. -- cgit v1.2.3-59-g8ed1b From c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:23 +0200 Subject: xsk: add user memory registration support sockopt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the base structure of the AF_XDP address family is set up. Further, we introduce the ability to register a window of user memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory window is viewed by an AF_XDP socket as a set of equally large frames. After a user memory registration, all frames are "owned" by the user application, and not the kernel. v2: More robust checks on umem creation and unaccount on error. Call set_page_dirty_lock on cleanup. Simplified xdp_umem_reg. Co-authored-by: Magnus Karlsson Signed-off-by: Magnus Karlsson Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 31 ++++++ include/uapi/linux/if_xdp.h | 34 ++++++ net/Makefile | 1 + net/xdp/Makefile | 2 + net/xdp/xdp_umem.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 45 ++++++++ net/xdp/xdp_umem_props.h | 23 +++++ net/xdp/xsk.c | 215 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 596 insertions(+) create mode 100644 include/net/xdp_sock.h create mode 100644 include/uapi/linux/if_xdp.h create mode 100644 net/xdp/Makefile create mode 100644 net/xdp/xdp_umem.c create mode 100644 net/xdp/xdp_umem.h create mode 100644 net/xdp/xdp_umem_props.h create mode 100644 net/xdp/xsk.c (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h new file mode 100644 index 000000000000..94785f5db13e --- /dev/null +++ b/include/net/xdp_sock.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * AF_XDP internal functions + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ */ + +#ifndef _LINUX_XDP_SOCK_H +#define _LINUX_XDP_SOCK_H + +#include +#include + +struct xdp_umem; + +struct xdp_sock { + /* struct sock must be the first member of struct xdp_sock */ + struct sock sk; + struct xdp_umem *umem; + /* Protects multiple processes in the control path */ + struct mutex mutex; +}; + +#endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..41252135a0fe --- /dev/null +++ b/include/uapi/linux/if_xdp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* XDP socket options */ +#define XDP_UMEM_REG 3 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 frame_size; /* Frame size */ + __u32 frame_headroom; /* Frame head room */ +}; + +#endif /* _LINUX_IF_XDP_H */ diff --git a/net/Makefile b/net/Makefile index a6147c61b174..77aaddedbd29 100644 --- a/net/Makefile +++ b/net/Makefile @@ -85,3 +85,4 @@ obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ +obj-$(CONFIG_XDP_SOCKETS) += xdp/ diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 000000000000..a5d736640a0f --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o + diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 000000000000..ec8b3552be44 --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +#define XDP_UMEM_MIN_FRAME_SIZE 2048 + +int xdp_umem_create(struct xdp_umem **umem) +{ + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); + + if (!(*umem)) + return -ENOMEM; + + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unsigned int i; + + if (umem->pgs) { + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; + + set_page_dirty_lock(page); + put_page(page); + } + + kfree(umem->pgs); + umem->pgs = NULL; + } +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + struct task_struct *task; + struct mm_struct *mm; + + if (umem->pgs) { + xdp_umem_unpin_pages(umem); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + mmput(mm); + umem->pgs = NULL; + } + + xdp_umem_unaccount_pages(umem); +out: + kfree(umem); +} + +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + +void xdp_get_umem(struct xdp_umem *umem) +{ + atomic_inc(&umem->users); +} + +void xdp_put_umem(struct xdp_umem *umem) +{ + if (!umem) + return; + + if (atomic_dec_and_test(&umem->users)) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + if (!umem->pgs) + return -ENOMEM; + + down_write(¤t->mm->mmap_sem); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + up_write(¤t->mm->mmap_sem); + + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kfree(umem->pgs); + umem->pgs = NULL; + return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u64 addr = mr->addr, size = mr->len; + unsigned int nframes, nfpp; + int size_chk, err; + + if (!umem) + return -EINVAL; + + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (!is_power_of_2(frame_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. 
For + * simplicity, this might change. + */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + nframes = size / frame_size; + if (nframes == 0 || nframes > UINT_MAX) + return -EINVAL; + + nfpp = PAGE_SIZE / frame_size; + if (nframes < nfpp || nframes % nfpp) + return -EINVAL; + + frame_headroom = ALIGN(frame_headroom, 64); + + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + if (size_chk < 0) + return -EINVAL; + + umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->size = (size_t)size; + umem->address = (unsigned long)addr; + umem->props.frame_size = frame_size; + umem->props.nframes = nframes; + umem->frame_headroom = frame_headroom; + umem->npgs = size / PAGE_SIZE; + umem->pgs = NULL; + umem->user = NULL; + + umem->frame_size_log2 = ilog2(frame_size); + umem->nfpp_mask = nfpp - 1; + umem->nfpplog2 = ilog2(nfpp); + atomic_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + goto out; + + err = xdp_umem_pin_pages(umem); + if (err) + goto out_account; + return 0; + +out_account: + xdp_umem_unaccount_pages(umem); +out: + put_pid(umem->pid); + return err; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 000000000000..4597ae81a221 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include +#include +#include + +#include "xdp_umem_props.h" + +struct xdp_umem { + struct page **pgs; + struct xdp_umem_props props; + u32 npgs; + u32 frame_headroom; + u32 nfpp_mask; + u32 nfpplog2; + u32 frame_size_log2; + struct user_struct *user; + struct pid *pid; + unsigned long address; + size_t size; + atomic_t users; + struct work_struct work; +}; + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); +void xdp_get_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem); +int xdp_umem_create(struct xdp_umem **umem); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h new file mode 100644 index 000000000000..77fb5daf29f3 --- /dev/null +++ b/net/xdp/xdp_umem_props.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef XDP_UMEM_PROPS_H_ +#define XDP_UMEM_PROPS_H_ + +struct xdp_umem_props { + u32 frame_size; + u32 nframes; +}; + +#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 000000000000..84e0e867febb --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +static struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + local_bh_enable(); + + sock_orphan(sk); + sock->sk = NULL; + + sk_refcnt_debug_release(sk); + sock_put(sk); + + return 0; +} + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_UMEM_REG: + { + struct xdp_umem_reg mr; + struct xdp_umem *umem; + + if (xs->umem) + return -EBUSY; + + if (copy_from_user(&mr, optval, sizeof(mr))) + return -EFAULT; + + mutex_lock(&xs->mutex); + err = xdp_umem_create(&umem); + + err = xdp_umem_reg(umem, &mr); + if (err) { + kfree(umem); + mutex_unlock(&xs->mutex); + return err; + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + + xs->umem = umem; + mutex_unlock(&xs->mutex); + return 0; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = sock_no_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct xdp_sock *xs; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + 
return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + + xs = xdp_sk(sk); + mutex_init(&xs->mutex); + + local_bh_disable(); + sock_prot_inuse_add(net, &xsk_proto, 1); + local_bh_enable(); + + return 0; +} + +static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static int __init xsk_init(void) +{ + int err; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + return 0; + +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init); -- cgit v1.2.3-59-g8ed1b From 423f38329d267969130fb6f2c685f73d72687558 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:24 +0200 Subject: xsk: add umem fill queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_FILL_RING. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_FILL_RING) into the process. The queue is used to explicitly pass ownership of umem frames from the user process to the kernel. These frames will in a later patch be filled in with Rx packet data by the kernel. v2: Fixed potential crash in xsk_mmap. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 15 +++++++++++ net/xdp/Makefile | 2 +- net/xdp/xdp_umem.c | 5 ++++ net/xdp/xdp_umem.h | 2 ++ net/xdp/xsk.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 58 ++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 38 ++++++++++++++++++++++++++ 7 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 net/xdp/xsk_queue.c create mode 100644 net/xdp/xsk_queue.h (limited to 'net/xdp') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 41252135a0fe..975661e1baca 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,6 +23,7 @@ /* XDP socket options */ #define XDP_UMEM_REG 3 +#define XDP_UMEM_FILL_RING 4 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -31,4 +32,18 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +/* Pgoff for mmaping the rings */ +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 + +struct xdp_ring { + __u32 producer __attribute__((aligned(64))); + __u32 consumer __attribute__((aligned(64))); +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + __u32 desc[0] __attribute__((aligned(64))); +}; + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/Makefile b/net/xdp/Makefile index a5d736640a0f..074fb2b2d51c 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ec8b3552be44..e1f627d0cc1c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,11 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + if (umem->fq) { + xskq_destroy(umem->fq); + umem->fq = NULL; + } + + if
(umem->pgs) { xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 4597ae81a221..25634b8a5c6f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -19,9 +19,11 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem_props.h" struct xdp_umem { + struct xsk_queue *fq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 84e0e867febb..da67a3c5c1c9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -32,6 +32,7 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem.h" static struct xdp_sock *xdp_sk(struct sock *sk) @@ -39,6 +40,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int xsk_init_queue(u32 entries, struct xsk_queue **queue) +{ + struct xsk_queue *q; + + if (entries == 0 || *queue || !is_power_of_2(entries)) + return -EINVAL; + + q = xskq_create(entries); + if (!q) + return -ENOMEM; + + *queue = q; + return 0; +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -101,6 +117,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return 0; } + case XDP_UMEM_FILL_RING: + { + struct xsk_queue **q; + int entries; + + if (!xs->umem) + return -EINVAL; + + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->umem->fq; + err = xsk_init_queue(entries, q); + mutex_unlock(&xs->mutex); + return err; + } default: break; } @@ -108,6 +141,36 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct xdp_sock *xs = xdp_sk(sock->sk); + struct xsk_queue *q = NULL; + unsigned long pfn; + struct page *qpg; + + if (!xs->umem) + return -EINVAL; + + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else + return -EINVAL; + + if (!q) + return -EINVAL; + + qpg = virt_to_head_page(q->ring); + if (size > (PAGE_SIZE << compound_order(qpg))) + return -EINVAL; + + pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, + size, vma->vm_page_prot); +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -131,7 +194,7 @@ static const struct proto_ops xsk_proto_ops = { .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .mmap = sock_no_mmap, + .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c new file mode 100644 index 000000000000..23da4f29d3fb --- /dev/null +++ b/net/xdp/xsk_queue.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include + +#include "xsk_queue.h" + +static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +{ + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); +} + +struct xsk_queue *xskq_create(u32 nentries) +{ + struct xsk_queue *q; + gfp_t gfp_flags; + size_t size; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return NULL; + + q->nentries = nentries; + q->ring_mask = nentries - 1; + + gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | + __GFP_COMP | __GFP_NORETRY; + size = xskq_umem_get_ring_size(q); + + q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, + get_order(size)); + if (!q->ring) { + kfree(q); + return NULL; + } + + return q; +} + +void xskq_destroy(struct xsk_queue *q) +{ + if (!q) + return; + + page_frag_free(q->ring); + kfree(q); +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h new file mode 100644 index 000000000000..7eb556bf73be --- /dev/null +++ b/net/xdp/xsk_queue.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_XSK_QUEUE_H +#define _LINUX_XSK_QUEUE_H + +#include +#include + +#include "xdp_umem_props.h" + +struct xsk_queue { + struct xdp_umem_props umem_props; + u32 ring_mask; + u32 nentries; + u32 prod_head; + u32 prod_tail; + u32 cons_head; + u32 cons_tail; + struct xdp_ring *ring; + u64 invalid_descs; +}; + +struct xsk_queue *xskq_create(u32 nentries); +void xskq_destroy(struct xsk_queue *q); + +#endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3-59-g8ed1b From b9b6b68e8abd101be6eb5330e4999218c696d1e8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:25 +0200 Subject: xsk: add Rx queue setup and mmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Another setsockopt (XDP_RX_RING) is added to let the process allocate a queue, where the kernel can pass completed Rx frames to the user process. The mmapping of the queue is done using the XDP_PGOFF_RX_RING offset.
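To illustrate the interface, a minimal sketch of the user-space side (not part of the patch: the helper name, the xsk_fd argument and the SOL_XDP fallback value are assumptions, and error handling is reduced to returning NULL):

#include <stddef.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283 /* value from include/linux/socket.h */
#endif

/* Create an Rx ring with ndescs entries (a power of two, as enforced
 * by xsk_init_queue()) and map it into the process.
 */
static struct xdp_rxtx_ring *xsk_setup_rx_ring(int xsk_fd, int ndescs)
{
	size_t len = sizeof(struct xdp_rxtx_ring) +
		     ndescs * sizeof(struct xdp_desc);
	void *map;

	if (setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING,
		       &ndescs, sizeof(ndescs)))
		return NULL;

	map = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, xsk_fd, XDP_PGOFF_RX_RING);
	return map == MAP_FAILED ? NULL : map;
}

Note that xsk_mmap checks the requested length against the pages actually allocated for the ring, so mapping with the exact struct-based length above stays within bounds.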
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 16 ++++++++++++++++ net/xdp/xsk.c | 41 ++++++++++++++++++++++++++++++++--------- net/xdp/xsk_queue.c | 11 +++++++++-- net/xdp/xsk_queue.h | 2 +- 5 files changed, 62 insertions(+), 12 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 94785f5db13e..db9a321de087 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,11 +18,15 @@ #include #include +struct net_device; +struct xsk_queue; struct xdp_umem; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; + struct xsk_queue *rx; + struct net_device *dev; struct xdp_umem *umem; /* Protects multiple processes in the control path */ struct mutex mutex; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 975661e1baca..65324558829d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -22,6 +22,7 @@ #include /* XDP socket options */ +#define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 @@ -33,13 +34,28 @@ struct xdp_umem_reg { }; /* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +struct xdp_desc { + __u32 idx; + __u32 len; + __u16 offset; + __u8 flags; + __u8 padding[5]; +}; + struct xdp_ring { __u32 producer __attribute__((aligned(64))); __u32 consumer __attribute__((aligned(64))); }; +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] __attribute__((aligned(64))); +}; + /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index da67a3c5c1c9..92bd9b7e548f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "xsk_queue.h" #include "xdp_umem.h" @@ -40,14 +41,15 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } -static int xsk_init_queue(u32 entries, struct xsk_queue **queue) +static int xsk_init_queue(u32 entries, struct xsk_queue **queue, + bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; - q = xskq_create(entries); + q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; @@ -89,6 +91,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; switch (optname) { + case XDP_RX_RING: + { + struct xsk_queue **q; + int entries; + + if (optlen < sizeof(entries)) + return -EINVAL; + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->rx; + err = xsk_init_queue(entries, q, false); + mutex_unlock(&xs->mutex); + return err; + } case XDP_UMEM_REG: { struct xdp_umem_reg mr; @@ -130,7 +148,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_lock(&xs->mutex); q = &xs->umem->fq; - err = xsk_init_queue(entries, q); + err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } @@ -151,13 +169,17 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; - if (!xs->umem) - return -EINVAL; + if (offset == XDP_PGOFF_RX_RING) { + q = xs->rx; + } else { + if (!xs->umem) + return -EINVAL; - if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; - else - return -EINVAL; + if (offset == 
XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else + return -EINVAL; + } if (!q) return -EINVAL; @@ -205,6 +227,7 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; + xskq_destroy(xs->rx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 23da4f29d3fb..894f9f89afc7 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -21,7 +21,13 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); } -struct xsk_queue *xskq_create(u32 nentries) +static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) +{ + return (sizeof(struct xdp_ring) + + q->nentries * sizeof(struct xdp_desc)); +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; gfp_t gfp_flags; @@ -36,7 +42,8 @@ struct xsk_queue *xskq_create(u32 nentries) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = xskq_umem_get_ring_size(q); + size = umem_queue ? xskq_umem_get_ring_size(q) : + xskq_rxtx_get_ring_size(q); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7eb556bf73be..5439fa381763 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,7 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; -struct xsk_queue *xskq_create(u32 nentries); +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3-59-g8ed1b From 965a990984432cd01a9eb3514c64d86f56704295 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:26 +0200 Subject: xsk: add support for bind for Rx Here, the bind syscall is added. Binding an AF_XDP socket means associating the socket with an umem, a netdev and a queue index. This can be done in two ways. The first way is creating a "socket from scratch": create the umem using the XDP_UMEM_REG setsockopt and an associated fill queue with XDP_UMEM_FILL_RING, create the Rx queue using the XDP_RX_RING setsockopt, and call bind, passing the ifindex and queue index ("channel" in ethtool speak). The second way to bind a socket is simply skipping the umem/netdev/queue index and passing an already set up AF_XDP socket instead. The new socket will then have the same umem/netdev/queue index as the parent, so it will share the same umem. You must also set the flags field in the socket address to XDP_SHARED_UMEM. v2: Use PTR_ERR instead of passing error variable explicitly.
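A sketch of the user-space call covering both flavours (illustrative only; the helper name and parameters are placeholders, and the AF_XDP fallback value is the one from include/linux/socket.h):

#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44 /* value from include/linux/socket.h */
#endif

/* Bind xsk_fd to the given queue of the device at ifindex. When
 * parent_fd is >= 0, inherit umem/netdev/queue from that socket
 * instead (the XDP_SHARED_UMEM flavour); the kernel still verifies
 * that ifindex/queue_id match the parent's binding.
 */
static int xsk_bind_rx(int xsk_fd, int ifindex, __u32 queue_id,
		       int parent_fd)
{
	struct sockaddr_xdp sxdp;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue_id;
	if (parent_fd >= 0) {
		sxdp.sxdp_flags = XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = parent_fd;
	}
	return bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}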
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 11 ++++ net/xdp/xdp_umem.c | 5 ++ net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 124 +++++++++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 8 +++ net/xdp/xsk_queue.h | 1 + 7 files changed, 150 insertions(+), 1 deletion(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index db9a321de087..85d02512f59b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; }; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 65324558829d..e5091881f776 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -21,6 +21,17 @@ #include +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM 1 + +struct sockaddr_xdp { + __u16 sxdp_family; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; + __u16 sxdp_flags; +}; + /* XDP socket options */ #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index e1f627d0cc1c..9bac1ad570fa 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -248,3 +248,8 @@ out: put_pid(umem->pid); return err; } + +bool xdp_umem_validate_queues(struct xdp_umem *umem) +{ + return umem->fq; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 25634b8a5c6f..b13133e9c501 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,7 @@ struct xdp_umem { struct work_struct work; }; +bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92bd9b7e548f..bf2c97b87992 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -57,9 +57,18 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void __xsk_release(struct xdp_sock *xs) +{ + /* Wait for driver to stop using the xdp socket. 
*/ + synchronize_net(); + + dev_put(xs->dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) @@ -71,6 +80,11 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); + if (xs->dev) { + __xsk_release(xs); + xs->dev = NULL; + } + sock_orphan(sk); sock->sk = NULL; @@ -80,6 +94,114 @@ static int xsk_release(struct socket *sock) return 0; } +static struct socket *xsk_lookup_xsk_from_fd(int fd) +{ + struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return ERR_PTR(-ENOPROTOOPT); + } + + return sock; +} + +static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; + struct sock *sk = sock->sk; + struct net_device *dev, *dev_curr; + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_umem *old_umem = NULL; + int err = 0; + + if (addr_len < sizeof(struct sockaddr_xdp)) + return -EINVAL; + if (sxdp->sxdp_family != AF_XDP) + return -EINVAL; + + mutex_lock(&xs->mutex); + dev_curr = xs->dev; + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); + if (!dev) { + err = -ENODEV; + goto out_release; + } + + if (!xs->rx) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + struct xdp_sock *umem_xs; + struct socket *sock; + + if (xs->umem) { + /* We have already our own. */ + err = -EINVAL; + goto out_unlock; + } + + sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); + if (IS_ERR(sock)) { + err = PTR_ERR(sock); + goto out_unlock; + } + + umem_xs = xdp_sk(sock->sk); + if (!umem_xs->umem) { + /* No umem to inherit. */ + err = -EBADF; + sockfd_put(sock); + goto out_unlock; + } else if (umem_xs->dev != dev || + umem_xs->queue_id != sxdp->sxdp_queue_id) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xdp_get_umem(umem_xs->umem); + old_umem = xs->umem; + xs->umem = umem_xs->umem; + sockfd_put(sock); + } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { + err = -EINVAL; + goto out_unlock; + } + + /* Rebind? 
*/ + if (dev_curr && (dev_curr != dev || + xs->queue_id != sxdp->sxdp_queue_id)) { + __xsk_release(xs); + if (old_umem) + xdp_put_umem(old_umem); + } + + xs->dev = dev; + xs->queue_id = sxdp->sxdp_queue_id; + + xskq_set_umem(xs->rx, &xs->umem->props); + +out_unlock: + if (err) + dev_put(dev); +out_release: + mutex_unlock(&xs->mutex); + return err; +} + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -203,7 +325,7 @@ static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, - .bind = sock_no_bind, + .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 894f9f89afc7..d012e5e23591 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -16,6 +16,14 @@ #include "xsk_queue.h" +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +{ + if (!q) + return; + + q->umem_props = *umem_props; +} + static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5439fa381763..9ddd2ee07a84 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,6 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); -- cgit v1.2.3-59-g8ed1b From c497176cb2e478f0a5713b0e05f242276e3194b5 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:27 +0200 Subject: xsk: add Rx receive functions and poll support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here the actual receive functions of AF_XDP are implemented; in a later commit, they will be called from the XDP layers. There's one set of functions for the XDP_DRV side and another for XDP_SKB (generic). A new XDP API, xdp_return_buff, is also introduced. It is analogous to xdp_return_frame, but acts upon a struct xdp_buff. The API will be used by AF_XDP in future commits. Support for the poll syscall is also implemented. v2: xskq_validate_id did not update cons_tail. The entries variable was calculated twice in xskq_nb_avail. Squashed xdp_return_buff commit.
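A sketch of how user space could drive this with poll() (assumed names only; cached_cons stands for the application's local copy of the consumer index, which it advances and writes back to rx->ptrs.consumer as it processes descriptors):

#include <poll.h>
#include <linux/if_xdp.h>

/* Block until the kernel signals POLLIN, then return the number of Rx
 * descriptors available between the shared producer index and our
 * cached consumer index.
 */
static int xsk_wait_rx(int xsk_fd, struct xdp_rxtx_ring *rx,
		       __u32 cached_cons)
{
	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

	if (poll(&pfd, 1, -1) <= 0 || !(pfd.revents & POLLIN))
		return 0;

	/* The acquire load pairs with the kernel's smp_wmb() in
	 * xskq_produce_flush_desc() before it publishes the producer.
	 */
	return (int)(__atomic_load_n(&rx->ptrs.producer,
				     __ATOMIC_ACQUIRE) - cached_cons);
}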
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + include/net/xdp_sock.h | 22 ++++++++++ net/core/xdp.c | 15 +++++-- net/xdp/xdp_umem.h | 18 ++++++++ net/xdp/xsk.c | 73 ++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 238 insertions(+), 5 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp.h b/include/net/xdp.h index 137ad5f9f40f..0b689cf561c7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 85d02512f59b..a0342dff6a4d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,28 @@ struct xdp_sock { u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; + u64 rx_dropped; }; +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} +#endif /* CONFIG_XDP_SOCKETS */ + #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 0c86b53a3a63..bf6758f74339 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,11 +308,9 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(struct xdp_frame *xdpf) +static void xdp_return(void *data, struct xdp_mem_info *mem) { - struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; - void *data = xdpf->data; struct page *page; switch (mem->type) { @@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf) break; } } + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + xdp_return(xdpf->data, &xdpf->mem); +} EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + xdp_return(xdp->data, &xdp->rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index b13133e9c501..c7378a11721f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,24 @@ struct xdp_umem { struct work_struct work; }; +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +{ + u64 pg, off; + char *data; + + pg = idx >> umem->nfpplog2; + off = (idx & umem->nfpp_mask) << umem->frame_size_log2; + + data = page_address(umem->pgs[pg]); + return data + off; +} + +static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, + u32 idx) +{ + return xdp_umem_get_data(umem, idx) + umem->frame_headroom; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bf2c97b87992..4e1e6c581e1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,74 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 *id, len = xdp->data_end - xdp->data; + 
void *buffer; + int err = 0; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + id = xskq_peek_id(xs->umem->fq); + if (!id) + return -ENOSPC; + + buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, *id, len, + xs->umem->frame_headroom); + if (!err) + xskq_discard_id(xs->umem->fq); + + return err; +} + +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (likely(!err)) + xdp_return_buff(xdp); + else + xs->rx_dropped++; + + return err; +} + +void xsk_flush(struct xdp_sock *xs) +{ + xskq_produce_flush_desc(xs->rx); + xs->sk.sk_data_ready(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (!err) + xsk_flush(xs); + else + xs->rx_dropped++; + + return err; +} + +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (xs->rx && !xskq_empty_desc(xs->rx)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { @@ -179,6 +247,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; goto out_unlock; + } else { + /* This xsk has its own umem. */ + xskq_set_umem(xs->umem->fq, &xs->umem->props); } /* Rebind? */ @@ -330,7 +401,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = sock_no_poll, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ddd2ee07a84..0a9b92b4f93a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -20,6 +20,8 @@ #include "xdp_umem_props.h" +#define RX_BATCH_SIZE 16 + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -32,8 +34,118 @@ struct xsk_queue { u64 invalid_descs; }; +/* Common functions operating for both RXTX and umem queues */ + +static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries == 0) { + /* Refresh the local pointer */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + } + + return (entries > dcnt) ? 
dcnt : entries; +} + +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +{ + u32 free_entries = q->nentries - (producer - q->cons_tail); + + if (free_entries >= dcnt) + return free_entries; + + /* Refresh the local tail pointer */ + q->cons_tail = READ_ONCE(q->ring->consumer); + return q->nentries - (producer - q->cons_tail); +} + +/* UMEM queue */ + +static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +{ + if (unlikely(idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + return true; +} + +static inline u32 *xskq_validate_id(struct xsk_queue *q) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_id(q, ring->desc[idx])) + return &ring->desc[idx]; + + q->cons_tail++; + } + + return NULL; +} + +static inline u32 *xskq_peek_id(struct xsk_queue *q) +{ + struct xdp_umem_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_id(q); + } + + ring = (struct xdp_umem_ring *)q->ring; + return &ring->desc[q->cons_tail & q->ring_mask]; +} + +static inline void xskq_discard_id(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_id(q); +} + +/* Rx queue */ + +static inline int xskq_produce_batch_desc(struct xsk_queue *q, + u32 id, u32 len, u16 offset) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx; + + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + idx = (q->prod_head++) & q->ring_mask; + ring->desc[idx].idx = id; + ring->desc[idx].len = len; + ring->desc[idx].offset = offset; + + return 0; +} + +static inline void xskq_produce_flush_desc(struct xsk_queue *q) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail = q->prod_head, + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + +static inline bool xskq_empty_desc(struct xsk_queue *q) +{ + return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); +} + void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); -void xskq_destroy(struct xsk_queue *q); +void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3-59-g8ed1b From fbfc504a24f53f7ebe128ab55cb5dba634f4ece8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:28 +0200 Subject: bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will *only* accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. 
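The BPF side of an xskmap could look like the following sketch (illustrative; the map and section names are made up, and "bpf_helpers.h" refers to the helper header shipped with the kernel samples of this era, not to anything added by this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") xsks_map = {
	.type = BPF_MAP_TYPE_XSKMAP,
	.key_size = sizeof(int),	/* must be 4, see xsk_map_alloc() */
	.value_size = sizeof(int),	/* an AF_XDP socket fd */
	.max_entries = 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	/* Redirect frames to the socket stored in the slot matching the
	 * Rx queue index; an empty slot makes the redirect fail later in
	 * xdp_do_redirect() and the frame is dropped.
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
}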
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 25 +++++ include/linux/bpf_types.h | 3 + include/net/xdp_sock.h | 7 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/verifier.c | 8 +- kernel/bpf/xskmap.c | 239 ++++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk.c | 5 + 8 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/xskmap.c (limited to 'net/xdp') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c553f6f9c6b0..68ecdb4eea09 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map, } #endif +#if defined(CONFIG_XDP_SOCKETS) +struct xdp_sock; +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); +#else +struct xdp_sock; +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} +#endif + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b28fcf6f6ae..d7df1b323082 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) +#if defined(CONFIG_XDP_SOCKETS) +BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) +#endif #endif diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a0342dff6a4d..ce3a2ab16b8f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + struct list_head flush_node; u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; @@ -39,6 +40,7 @@ struct xdp_buff; int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static inline void xsk_flush(struct xdp_sock *xs) { } + +static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return false; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8daef7326bb7..a3a495052511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 35c485fa9ea3..f27f5496d6fe 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +ifeq 
($(CONFIG_XDP_SOCKETS),y) +obj-$(CONFIG_BPF_SYSCALL) += xskmap.o +endif obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 712d8655e916..0d91f18b2eb5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2070,8 +2070,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; - /* Restrict bpf side of cpumap, open when use-cases appear */ + /* Restrict bpf side of cpumap and xskmap, open when use-cases + * appear. + */ case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2118,7 +2121,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_CPUMAP) + map->map_type != BPF_MAP_TYPE_CPUMAP && + map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c new file mode 100644 index 000000000000..869dbb11b612 --- /dev/null +++ b/kernel/bpf/xskmap.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include + +struct xsk_map { + struct bpf_map map; + struct xdp_sock **xsk_map; + struct list_head __percpu *flush_list; +}; + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + int cpu, err = -EINVAL; + struct xsk_map *m; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + m = kzalloc(sizeof(*m), GFP_USER); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + + cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); + cost += sizeof(struct list_head) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_m; + + m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + err = bpf_map_precharge_memlock(m->map.pages); + if (err) + goto free_m; + + m->flush_list = alloc_percpu(struct list_head); + if (!m->flush_list) + goto free_m; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); + + m->xsk_map = bpf_map_area_alloc(m->map.max_entries * + sizeof(struct xdp_sock *), + m->map.numa_node); + if (!m->xsk_map) + goto free_percpu; + return &m->map; + +free_percpu: + free_percpu(m->flush_list); +free_m: + kfree(m); + return ERR_PTR(err); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + int i; + + synchronize_net(); + + for (i = 0; i < map->max_entries; i++) { + struct xdp_sock *xs; + + xs = m->xsk_map[i]; + if (!xs) + continue; + + sock_put((struct sock *)xs); + } + + free_percpu(m->flush_list); + bpf_map_area_free(m->xsk_map); + kfree(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} + +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del(xs->flush_node.prev, xs->flush_node.next); + xs->flush_node.prev = NULL; + } +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xdp_sock *xs, *old_xs; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + sock_hold(sock->sk); + + old_xs = xchg(&m->xsk_map[i], xs); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + sockfd_put(sock); + return 0; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + old_xs = xchg(&m->xsk_map[k], NULL); + if (old_xs) { + /* Make sure we've flushed everything. 
*/ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + return 0; +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, +}; + + diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e1e6c581e1d..b931a0db5588 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return !!xs->rx; +} + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 *id, len = xdp->data_end - xdp->data; -- cgit v1.2.3-59-g8ed1b From fe2308328cd2f26ebc986f543796e7d13ae00bc4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:31 +0200 Subject: xsk: add umem completion queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_COMPLETION_QUEUE. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_COMPLETION_QUEUE) into the process. The queue is used to explicitly pass ownership of umem frames from the kernel to user process. This will be used by the TX path to tell user space that a certain frame has been transmitted and user space can use it for something else, if it wishes. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xdp_umem.c | 7 ++++++- net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 7 ++++++- 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net/xdp') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e5091881f776..71581a139f26 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -36,6 +36,7 @@ struct sockaddr_xdp { #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 +#define XDP_UMEM_COMPLETION_RING 5 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 struct xdp_desc { __u32 idx; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9bac1ad570fa..881dfdefe235 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -70,6 +70,11 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->fq = NULL; } + if (umem->cq) { + xskq_destroy(umem->cq); + umem->cq = NULL; + } + if (umem->pgs) { xdp_umem_unpin_pages(umem); @@ -251,5 +256,5 @@ out: bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return umem->fq; + return (umem->fq && umem->cq); } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c7378a11721f..7e0b2fab8522 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -24,6 +24,7 @@ struct xdp_umem { struct xsk_queue *fq; + struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b931a0db5588..f4a2c5bc6da9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -255,6 +255,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else { /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); + xskq_set_umem(xs->umem->cq, &xs->umem->props); } /* Rebind? 
*/ @@ -334,6 +335,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return 0; } case XDP_UMEM_FILL_RING: + case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; @@ -345,7 +347,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->umem->fq; + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : + &xs->umem->cq; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; @@ -375,6 +378,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_UMEM_PGOFF_FILL_RING) q = xs->umem->fq; + else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) + q = xs->umem->cq; else return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From f61459030ec7fffdaa3c462cc0f728eef11b4d05 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:32 +0200 Subject: xsk: add Tx queue setup and mmap support Another setsockopt (XDP_TX_QUEUE) is added to let the process allocate a queue, where the user process can pass frames to be transmitted by the kernel. The mmapping of the queue is done using the XDP_PGOFF_TX_QUEUE offset. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xsk.c | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ce3a2ab16b8f..185f4928fbda 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; + struct xsk_queue *tx ____cacheline_aligned_in_smp; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 71581a139f26..e2ea878d025c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -34,6 +34,7 @@ struct sockaddr_xdp { /* XDP socket options */ #define XDP_RX_RING 1 +#define XDP_TX_RING 2 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f4a2c5bc6da9..2d7b0c90d996 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -206,7 +206,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_release; } - if (!xs->rx) { + if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } @@ -291,6 +291,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_RX_RING: + case XDP_TX_RING: { struct xsk_queue **q; int entries; @@ -301,7 +302,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->rx; + q = (optname == XDP_TX_RING) ? 
&xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); return err; @@ -372,6 +373,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_PGOFF_RX_RING) { q = xs->rx; + } else if (offset == XDP_PGOFF_TX_RING) { + q = xs->tx; } else { if (!xs->umem) return -EINVAL; @@ -431,6 +434,7 @@ static void xsk_destruct(struct sock *sk) return; xskq_destroy(xs->rx); + xskq_destroy(xs->tx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); -- cgit v1.2.3-59-g8ed1b From 35fcde7f8deb51b707b161bf19cbd22363aef2df Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:34 +0200 Subject: xsk: support for Tx Here, Tx support is added. The user fills the Tx queue with frames to be sent by the kernel, and let's the kernel know using the sendmsg syscall. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++-- net/xdp/xsk_queue.h | 93 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 4 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2d7b0c90d996..b33c535c7996 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,8 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#define TX_BATCH_SIZE 16 + static struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; @@ -101,6 +103,108 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +static void xsk_destruct_skb(struct sk_buff *skb) +{ + u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + struct xdp_sock *xs = xdp_sk(skb->sk); + + WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + + sock_wfree(skb); +} + +static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, + size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + u32 max_batch = TX_BATCH_SIZE; + struct xdp_sock *xs = xdp_sk(sk); + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + if (unlikely(!xs->tx)) + return -ENOBUFS; + if (need_wait) + return -EOPNOTSUPP; + + mutex_lock(&xs->mutex); + + while (xskq_peek_desc(xs->tx, &desc)) { + char *buffer; + u32 id, len; + + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + if (xskq_reserve_id(xs->umem->cq)) { + err = -EAGAIN; + goto out; + } + + len = desc.len; + if (unlikely(len > xs->dev->mtu)) { + err = -EMSGSIZE; + goto out; + } + + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); + if (unlikely(!skb)) { + err = -EAGAIN; + goto out; + } + + skb_put(skb, len); + id = desc.idx; + buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto out; + } + + skb->dev = xs->dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb->destructor = xsk_destruct_skb; + + err = dev_direct_xmit(skb, xs->queue_id); + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { + err = -EAGAIN; + /* SKB consumed by dev_direct_xmit() */ + goto out; + } + + sent_frame = true; + xskq_discard_desc(xs->tx); + } + +out: + if (sent_frame) + sk->sk_write_space(sk); + + mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!xs->dev)) + return -ENXIO; + if (unlikely(!(xs->dev->flags & 
IFF_UP))) + return -ENETDOWN; + + return xsk_generic_xmit(sk, m, total_len); +} + static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -110,6 +214,8 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; + if (xs->tx && !xskq_full_desc(xs->tx)) + mask |= POLLOUT | POLLWRNORM; return mask; } @@ -270,6 +376,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = sxdp->sxdp_queue_id; xskq_set_umem(xs->rx, &xs->umem->props); + xskq_set_umem(xs->tx, &xs->umem->props); out_unlock: if (err) @@ -383,8 +490,6 @@ static int xsk_mmap(struct file *file, struct socket *sock, q = xs->umem->fq; else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = xs->umem->cq; - else - return -EINVAL; } if (!q) @@ -420,7 +525,7 @@ static const struct proto_ops xsk_proto_ops = { .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = sock_no_getsockopt, - .sendmsg = sock_no_sendmsg, + .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 0a9b92b4f93a..3497e8808608 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -111,7 +111,93 @@ static inline void xskq_discard_id(struct xsk_queue *q) (void)xskq_validate_id(q); } -/* Rx queue */ +static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[q->prod_tail++ & q->ring_mask] = id; + + /* Order producer and data */ + smp_wmb(); + + WRITE_ONCE(q->ring->producer, q->prod_tail); + return 0; +} + +static inline int xskq_reserve_id(struct xsk_queue *q) +{ + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + q->prod_head++; + return 0; +} + +/* Rx/Tx queue */ + +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +{ + u32 buff_len; + + if (unlikely(d->idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + + buff_len = q->umem_props.frame_size; + if (unlikely(d->len > buff_len || d->len == 0 || + d->offset > buff_len || d->offset + d->len > buff_len)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_desc(q, &ring->desc[idx])) { + if (desc) + *desc = ring->desc[idx]; + return desc; + } + + q->cons_tail++; + } + + return NULL; +} + +static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + struct xdp_rxtx_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_desc(q, desc); + } + + ring = (struct xdp_rxtx_ring *)q->ring; + *desc = ring->desc[q->cons_tail & q->ring_mask]; + return desc; +} + +static inline void xskq_discard_desc(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_desc(q, NULL); +} static inline int xskq_produce_batch_desc(struct xsk_queue *q, u32 id, u32 len, u16 offset) @@ -139,6 +225,11 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) WRITE_ONCE(q->ring->producer, 
q->prod_tail); } +static inline bool xskq_full_desc(struct xsk_queue *q) +{ + return (xskq_nb_avail(q, q->nentries) == q->nentries); +} + static inline bool xskq_empty_desc(struct xsk_queue *q) { return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); -- cgit v1.2.3-59-g8ed1b From af75d9e02d08dc55ce6a1e42e485465c630d7349 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:35 +0200 Subject: xsk: statistics support In this commit, a new getsockopt is added: XDP_STATISTICS. This is used to obtain stats from the sockets. v2: getsockopt now returns size of stats structure. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 7 +++++++ net/xdp/xsk.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 5 +++++ 3 files changed, 56 insertions(+), 1 deletion(-) (limited to 'net/xdp') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e2ea878d025c..77b88c4efe98 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -38,6 +38,7 @@ struct sockaddr_xdp { #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 +#define XDP_STATISTICS 6 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -46,6 +47,12 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b33c535c7996..009c5af5bba5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -468,6 +468,49 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int len; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case XDP_STATISTICS: + { + struct xdp_statistics stats; + + if (len < sizeof(stats)) + return -EINVAL; + + mutex_lock(&xs->mutex); + stats.rx_dropped = xs->rx_dropped; + stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); + stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); + mutex_unlock(&xs->mutex); + + if (copy_to_user(optval, &stats, sizeof(stats))) + return -EFAULT; + if (put_user(sizeof(stats), optlen)) + return -EFAULT; + + return 0; + } + default: + break; + } + + return -EOPNOTSUPP; +} + static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { @@ -524,7 +567,7 @@ static const struct proto_ops xsk_proto_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, - .getsockopt = sock_no_getsockopt, + .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 3497e8808608..7aa9a535db0e 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -36,6 +36,11 @@ struct xsk_queue { /* Common functions operating for both RXTX and umem queues */ +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? 
q->invalid_descs : 0; +} + static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) { u32 entries = q->prod_tail - q->cons_tail; -- cgit v1.2.3-59-g8ed1b From ea7e3435297c6ba3c8ae51db6ea48f1ed657dc5c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 7 May 2018 19:43:50 +0200 Subject: xsk: fix 64-bit division MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i386 builds report: net/xdp/xdp_umem.o: In function `xdp_umem_reg': xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3' This fix uses div_u64 instead of the GCC built-in. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Björn Töpel Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/xdp') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 881dfdefe235..2b47a1dd7c6c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -209,7 +209,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - nframes = size / frame_size; + nframes = (unsigned int)div_u64(size, frame_size); if (nframes == 0 || nframes > UINT_MAX) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 18 May 2018 14:00:21 +0200 Subject: xsk: clean up SPDX headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up SPDX-License-Identifier and removing licensing leftovers. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 13 ++----------- include/uapi/linux/if_xdp.h | 13 ++----------- kernel/bpf/xskmap.c | 9 --------- net/xdp/xdp_umem.c | 9 --------- net/xdp/xdp_umem.h | 13 ++----------- net/xdp/xdp_umem_props.h | 13 ++----------- net/xdp/xsk.c | 9 --------- net/xdp/xsk_queue.c | 9 --------- net/xdp/xsk_queue.h | 13 ++----------- samples/bpf/xdpsock_user.c | 12 +----------- 10 files changed, 11 insertions(+), 102 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 185f4928fbda..7a647c56ec15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * AF_XDP internal functions +/* SPDX-License-Identifier: GPL-2.0 */ +/* AF_XDP internal functions * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XDP_SOCK_H diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 77b88c4efe98..56db977221d2 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -1,17 +1,8 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note - * +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* * if_xdp: XDP socket user-space interface * Copyright(c) 2018 Intel Corporation. 
* - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel * Magnus Karlsson */ diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index cb3a12137404..b3c557476a8d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2b47a1dd7c6c..df4ea97c433b 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 7e0b2fab8522..70fe225baa51 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_H_ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 77fb5daf29f3..2cf8ec485fd2 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. 
- * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_PROPS_H_ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 009c5af5bba5..b8d1cb4d78c0 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -5,15 +5,6 @@ * applications. * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel * Magnus Karlsson */ diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d012e5e23591..9f605d22dad4 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7aa9a535db0e..928d464e57b9 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space ring structure +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XSK_QUEUE_H diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 7fe60f6f7d53..60a882a2296c 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1,15 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2017 - 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ +/* Copyright(c) 2017 - 2018 Intel Corporation. 
*/ #include #include -- cgit v1.2.3-59-g8ed1b From 54b85c27fea4ce2be32a69bc2357847fe0367c64 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 18 May 2018 14:00:22 +0200 Subject: xsk: remove newline at end of file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor cleanup, remove newline at end of Makefile. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'net/xdp') diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 074fb2b2d51c..04f073146256 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o - -- cgit v1.2.3-59-g8ed1b From da60cf00c1a576a459defed2edfb88c858510b64 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 18 May 2018 14:00:23 +0200 Subject: xsk: fixed some cases of unnecessary parentheses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed some cases of unnecessary parentheses. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 4 ++-- net/xdp/xsk_queue.c | 3 +-- net/xdp/xsk_queue.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index df4ea97c433b..c47909c74899 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -20,7 +20,7 @@ int xdp_umem_create(struct xdp_umem **umem) { *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - if (!(*umem)) + if (!*umem) return -ENOMEM; return 0; @@ -247,5 +247,5 @@ out: bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return (umem->fq && umem->cq); + return umem->fq && umem->cq; } diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 9f605d22dad4..ebe85e59507e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -22,8 +22,7 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) { - return (sizeof(struct xdp_ring) + - q->nentries * sizeof(struct xdp_desc)); + return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 928d464e57b9..62e43be407d8 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -223,12 +223,12 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return (xskq_nb_avail(q, q->nentries) == q->nentries); + return xskq_nb_avail(q, q->nentries) == q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); + return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); -- cgit v1.2.3-59-g8ed1b From c2f4374b9661db28090a63de1ccc40552c4de8fa Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 18 May 2018 14:00:24 +0200 Subject: xsk: proper '=' alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Properly align xsk_proto_ops initialization. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b8d1cb4d78c0..817340f7725d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -545,24 +545,24 @@ static struct proto xsk_proto = { }; static const struct proto_ops xsk_proto_ops = { - .family = PF_XDP, - .owner = THIS_MODULE, - .release = xsk_release, - .bind = xsk_bind, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = sock_no_getname, - .poll = xsk_poll, - .ioctl = sock_no_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = xsk_setsockopt, - .getsockopt = xsk_getsockopt, - .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, - .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = xsk_mmap, + .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) -- cgit v1.2.3-59-g8ed1b From 959b71db53e310bc63be22bfa3401aef30c6035d Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 22 May 2018 09:34:56 +0200 Subject: xsk: remove rebind support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supporting rebind, i.e. after a successful bind the process can call bind again without closing the socket, makes the AF_XDP setup state machine more complex. Constrain the state space, by not supporting rebind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 817340f7725d..cb1acd7009f4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -227,14 +227,6 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } -static void __xsk_release(struct xdp_sock *xs) -{ - /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - - dev_put(xs->dev); -} - static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -251,7 +243,9 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { - __xsk_release(xs); + /* Wait for driver to stop using the xdp socket. 
*/ + synchronize_net(); + dev_put(xs->dev); xs->dev = NULL; } @@ -285,9 +279,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; - struct net_device *dev, *dev_curr; struct xdp_sock *xs = xdp_sk(sk); - struct xdp_umem *old_umem = NULL; + struct net_device *dev; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -296,7 +289,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - dev_curr = xs->dev; + if (xs->dev) { + err = -EBUSY; + goto out_release; + } + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; @@ -343,7 +340,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xdp_get_umem(umem_xs->umem); - old_umem = xs->umem; xs->umem = umem_xs->umem; sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { @@ -355,14 +351,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xskq_set_umem(xs->umem->cq, &xs->umem->props); } - /* Rebind? */ - if (dev_curr && (dev_curr != dev || - xs->queue_id != sxdp->sxdp_queue_id)) { - __xsk_release(xs); - if (old_umem) - xdp_put_umem(old_umem); - } - xs->dev = dev; xs->queue_id = sxdp->sxdp_queue_id; -- cgit v1.2.3-59-g8ed1b From 2e59dd5e4f6e884d1f7a70acf9845d223429026e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 22 May 2018 09:34:58 +0200 Subject: xsk: proper queue id check at bind Validate the queue id against both Rx and Tx on the netdev. Also, make sure that the queue exists at xmit time. Reported-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cb1acd7009f4..29707354cf78 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -142,6 +142,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } + if (xs->queue_id >= xs->dev->real_num_tx_queues) { + err = -ENXIO; + goto out; + } + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); if (unlikely(!skb)) { err = -EAGAIN; @@ -305,7 +310,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || + (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } -- cgit v1.2.3-59-g8ed1b From b3a9e0be436960072ddf327d9d82f50ee2f620e0 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 22 May 2018 09:34:59 +0200 Subject: xsk: remove explicit ring structure from uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit we remove the explicit ring structure from the the uapi. It is tricky for an uapi to depend on a certain L1 cache line size, since it can differ for variants of the same architecture. Now, we let the user application determine the offsets of the producer, consumer and descriptors by asking the socket via getsockopt. A typical flow would be (Rx ring): struct xdp_mmap_offsets off; struct xdp_desc *ring; u32 *prod, *cons; void *map; ... 
getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); map = mmap(NULL, off.rx.desc + NUM_DESCS * sizeof(struct xdp_desc), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, sfd, XDP_PGOFF_RX_RING); prod = map + off.rx.producer; cons = map + off.rx.consumer; ring = map + off.rx.desc; Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 44 ++++++++++++++++++++++---------------------- net/xdp/xsk.c | 29 +++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 17 +++++++++++++++++ 3 files changed, 68 insertions(+), 22 deletions(-) (limited to 'net/xdp') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index c46609a00168..4737cfe222f5 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,13 +23,27 @@ struct sockaddr_xdp { __u32 sxdp_shared_umem_fd; }; +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + /* XDP socket options */ -#define XDP_RX_RING 1 -#define XDP_TX_RING 2 -#define XDP_UMEM_REG 3 -#define XDP_UMEM_FILL_RING 4 -#define XDP_UMEM_COMPLETION_RING 5 -#define XDP_STATISTICS 6 +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -50,6 +64,7 @@ struct xdp_statistics { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +/* Rx/Tx descriptor */ struct xdp_desc { __u32 idx; __u32 len; @@ -58,21 +73,6 @@ struct xdp_desc { __u8 padding[5]; }; -struct xdp_ring { - __u32 producer __attribute__((aligned(64))); - __u32 consumer __attribute__((aligned(64))); -}; - -/* Used for the RX and TX queues for packets */ -struct xdp_rxtx_ring { - struct xdp_ring ptrs; - struct xdp_desc desc[0] __attribute__((aligned(64))); -}; - -/* Used for the fill and completion queues for buffers */ -struct xdp_umem_ring { - struct xdp_ring ptrs; - __u32 desc[0] __attribute__((aligned(64))); -}; +/* UMEM descriptor is __u32 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 29707354cf78..378dd9287da5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -489,6 +489,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_MMAP_OFFSETS: + { + struct xdp_mmap_offsets off; + + if (len < sizeof(off)) + return -EINVAL; + + off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); + off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); + + off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.fr.desc = offsetof(struct xdp_umem_ring, desc); + off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.cr.desc = offsetof(struct xdp_umem_ring, desc); + + len = sizeof(off); + if (copy_to_user(optval, &off, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } 
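The fill and completion rings are discovered and mapped the same
way. A corresponding sketch for the fill ring, under the same
assumptions as the Rx flow above (fd is the AF_XDP socket, the ring
was first created via the XDP_UMEM_FILL_RING setsockopt, and
FILL_NUM_DESCS is its size; fill descriptors are plain __u32):

  struct xdp_mmap_offsets off;
  socklen_t optlen = sizeof(off);
  __u32 *fr_prod, *fr_cons, *fr_desc;
  void *map;

  getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
  map = mmap(NULL, off.fr.desc + FILL_NUM_DESCS * sizeof(__u32),
             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
             fd, XDP_UMEM_PGOFF_FILL_RING);
  fr_prod = map + off.fr.producer;
  fr_cons = map + off.fr.consumer;
  fr_desc = map + off.fr.desc;
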
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 62e43be407d8..cb8e5be35110 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -13,6 +13,23 @@ #define RX_BATCH_SIZE 16 +struct xdp_ring { + u32 producer ____cacheline_aligned_in_smp; + u32 consumer ____cacheline_aligned_in_smp; +}; + +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] ____cacheline_aligned_in_smp; +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + u32 desc[0] ____cacheline_aligned_in_smp; +}; + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; -- cgit v1.2.3-59-g8ed1b From 37b076933a8e38e72ffd3c40d3eeb5949f38baf3 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 22 May 2018 09:35:01 +0200 Subject: xsk: add missing write- and data-dependency barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, we add a missing write-barrier, and use READ_ONCE for the data-dependency barrier. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 378dd9287da5..01f010ec0c05 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -228,6 +228,8 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, if (!q) return -ENOMEM; + /* Make sure queue is ready before it can be seen by others */ + smp_wmb(); *queue = q; return 0; } @@ -532,21 +534,23 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; + struct xdp_umem *umem; unsigned long pfn; struct page *qpg; if (offset == XDP_PGOFF_RX_RING) { - q = xs->rx; + q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { - q = xs->tx; + q = READ_ONCE(xs->tx); } else { - if (!xs->umem) + umem = READ_ONCE(xs->umem); + if (!umem) return -EINVAL; if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; + q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) - q = xs->umem->cq; + q = READ_ONCE(umem->cq); } if (!q) -- cgit v1.2.3-59-g8ed1b From a49049ea257656f27ffe424224f4a362b8b1234a Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 22 May 2018 09:35:02 +0200 Subject: xsk: simplified umem setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by Daniel Borkmann, the umem setup code was a too defensive and complex. Here, we reduce the number of checks. Also, the memory pinning is now folded into the umem creation, and we do correct locking. 
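In outline, the XDP_UMEM_REG path now looks as follows (condensed
from the xsk.c hunk below); allocation, registration and page
pinning all happen inside xdp_umem_create(), and the result is
checked and published under the socket mutex:

  mutex_lock(&xs->mutex);
  if (xs->umem) {                       /* only one umem per socket */
          mutex_unlock(&xs->mutex);
          return -EBUSY;
  }
  umem = xdp_umem_create(&mr);          /* alloc + reg + pin pages */
  if (IS_ERR(umem)) {
          mutex_unlock(&xs->mutex);
          return PTR_ERR(umem);
  }
  /* Make sure umem is ready before it can be seen by others */
  smp_wmb();
  xs->umem = umem;
  mutex_unlock(&xs->mutex);
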
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 79 ++++++++++++++++++++++++++---------------------------- net/xdp/xdp_umem.h | 3 +-- net/xdp/xsk.c | 24 ++++++++--------- 3 files changed, 51 insertions(+), 55 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c47909c74899..faa6ffbaf6ab 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -16,39 +16,25 @@ #define XDP_UMEM_MIN_FRAME_SIZE 2048 -int xdp_umem_create(struct xdp_umem **umem) -{ - *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - - if (!*umem) - return -ENOMEM; - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; - if (umem->pgs) { - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; - kfree(umem->pgs); - umem->pgs = NULL; + set_page_dirty_lock(page); + put_page(page); } + + kfree(umem->pgs); + umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - if (umem->user) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); - } + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); } static void xdp_umem_release(struct xdp_umem *umem) @@ -66,22 +52,18 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - if (umem->pgs) { - xdp_umem_unpin_pages(umem); - - task = get_pid_task(umem->pid, PIDTYPE_PID); - put_pid(umem->pid); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; + xdp_umem_unpin_pages(umem); - mmput(mm); - umem->pgs = NULL; - } + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + mmput(mm); xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -167,16 +149,13 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; u64 addr = mr->addr, size = mr->len; unsigned int nframes, nfpp; int size_chk, err; - if (!umem) - return -EINVAL; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* @@ -245,6 +224,24 @@ out: return err; } +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) +{ + struct xdp_umem *umem; + int err; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + err = xdp_umem_reg(umem, mr); + if (err) { + kfree(umem); + return ERR_PTR(err); + } + + return umem; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem) { return umem->fq && umem->cq; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 70fe225baa51..9802287ff19d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -50,9 +50,8 @@ static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, } bool xdp_umem_validate_queues(struct xdp_umem *umem); -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); -int xdp_umem_create(struct xdp_umem **umem); +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c 
index 01f010ec0c05..cce0e4f8a536 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -406,25 +406,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xdp_umem_reg mr; struct xdp_umem *umem; - if (xs->umem) - return -EBUSY; - if (copy_from_user(&mr, optval, sizeof(mr))) return -EFAULT; mutex_lock(&xs->mutex); - err = xdp_umem_create(&umem); + if (xs->umem) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } - err = xdp_umem_reg(umem, &mr); - if (err) { - kfree(umem); + umem = xdp_umem_create(&mr); + if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); - return err; + return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; mutex_unlock(&xs->mutex); return 0; @@ -435,13 +433,15 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (!xs->umem) - return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); + if (!xs->umem) { + mutex_unlock(&xs->mutex); + return -EINVAL; + } + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); -- cgit v1.2.3-59-g8ed1b From d3b42f1422d9c050bf5a2c660c045af2ab5d3e72 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 22 May 2018 09:35:03 +0200 Subject: xsk: convert atomic_t to refcount_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce refcount_t, in favor of atomic_t. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 6 +++--- net/xdp/xdp_umem.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index faa6ffbaf6ab..87998818116f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -78,7 +78,7 @@ static void xdp_umem_release_deferred(struct work_struct *work) void xdp_get_umem(struct xdp_umem *umem) { - atomic_inc(&umem->users); + refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem) @@ -86,7 +86,7 @@ void xdp_put_umem(struct xdp_umem *umem) if (!umem) return; - if (atomic_dec_and_test(&umem->users)) { + if (refcount_dec_and_test(&umem->users)) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } @@ -206,7 +206,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->frame_size_log2 = ilog2(frame_size); umem->nfpp_mask = nfpp - 1; umem->nfpplog2 = ilog2(nfpp); - atomic_set(&umem->users, 1); + refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 9802287ff19d..0881cf456230 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -27,7 +27,7 @@ struct xdp_umem { struct pid *pid; unsigned long address; size_t size; - atomic_t users; + refcount_t users; struct work_struct work; }; -- cgit v1.2.3-59-g8ed1b From 4e64c835254095f55044d393e628dd3e92fca304 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 13:57:11 +0200 Subject: xsk: proper fill queue descriptor validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the fill queue descriptor was not copied to kernel space prior validating it, making it possible for userland to change the descriptor post-kernel-validation. 
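The essence of the change, excerpted from xskq_validate_id() in the
diff below: instead of validating the descriptor in place in the
shared ring, where user space can rewrite it after the check, the
descriptor is snapshotted once into a kernel-side copy, and only the
copy is validated and used:

  /* before: check and use read the shared ring entry twice */
  if (xskq_is_valid_id(q, ring->desc[idx]))
          return &ring->desc[idx];

  /* after: snapshot once, then validate and use the copy */
  *id = READ_ONCE(ring->desc[idx]);
  if (xskq_is_valid_id(q, *id))
          return id;
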
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 11 +++++------ net/xdp/xsk_queue.h | 32 +++++++++----------------------- 2 files changed, 14 insertions(+), 29 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cce0e4f8a536..43554eb56fe6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,20 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 *id, len = xdp->data_end - xdp->data; + u32 id, len = xdp->data_end - xdp->data; void *buffer; - int err = 0; + int err; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - id = xskq_peek_id(xs->umem->fq); - if (!id) + if (!xskq_peek_id(xs->umem->fq, &id)) return -ENOSPC; - buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + buffer = xdp_umem_get_data_with_headroom(xs->umem, id); memcpy(buffer, xdp->data, len); - err = xskq_produce_batch_desc(xs->rx, *id, len, + err = xskq_produce_batch_desc(xs->rx, id, len, xs->umem->frame_headroom); if (!err) xskq_discard_id(xs->umem->fq); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cb8e5be35110..b5924e7aeb2b 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -85,14 +85,15 @@ static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) return true; } -static inline u32 *xskq_validate_id(struct xsk_queue *q) +static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - if (xskq_is_valid_id(q, ring->desc[idx])) - return &ring->desc[idx]; + *id = READ_ONCE(ring->desc[idx]); + if (xskq_is_valid_id(q, *id)) + return id; q->cons_tail++; } @@ -100,28 +101,22 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q) return NULL; } -static inline u32 *xskq_peek_id(struct xsk_queue *q) +static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) { - struct xdp_umem_ring *ring; - if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ smp_rmb(); - - return xskq_validate_id(q); } - ring = (struct xdp_umem_ring *)q->ring; - return &ring->desc[q->cons_tail & q->ring_mask]; + return xskq_validate_id(q, id); } static inline void xskq_discard_id(struct xsk_queue *q) { q->cons_tail++; - (void)xskq_validate_id(q); } static inline int xskq_produce_id(struct xsk_queue *q, u32 id) @@ -174,11 +169,9 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - if (xskq_is_valid_desc(q, &ring->desc[idx])) { - if (desc) - *desc = ring->desc[idx]; + *desc = READ_ONCE(ring->desc[idx]); + if (xskq_is_valid_desc(q, desc)) return desc; - } q->cons_tail++; } @@ -189,27 +182,20 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, struct xdp_desc *desc) { - struct xdp_rxtx_ring *ring; - if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ smp_rmb(); - - return xskq_validate_desc(q, desc); } - ring = (struct xdp_rxtx_ring *)q->ring; - *desc = ring->desc[q->cons_tail & q->ring_mask]; - return desc; + return xskq_validate_desc(q, 
desc); } static inline void xskq_discard_desc(struct xsk_queue *q) { q->cons_tail++; - (void)xskq_validate_desc(q, NULL); } static inline int xskq_produce_batch_desc(struct xsk_queue *q, -- cgit v1.2.3-59-g8ed1b From a509a95536a86ef84deb16c656d741437791b414 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 13:57:12 +0200 Subject: xsk: proper Rx drop statistics update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, rx_dropped could be updated incorrectly, e.g. if the XDP program redirected the frame to a socket bound to a different queue than where the XDP program was executing. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/xdp') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 43554eb56fe6..966307ce4b8e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -48,8 +48,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (!xskq_peek_id(xs->umem->fq, &id)) + if (!xskq_peek_id(xs->umem->fq, &id)) { + xs->rx_dropped++; return -ENOSPC; + } buffer = xdp_umem_get_data_with_headroom(xs->umem, id); memcpy(buffer, xdp->data, len); @@ -57,6 +59,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) xs->umem->frame_headroom); if (!err) xskq_discard_id(xs->umem->fq); + else + xs->rx_dropped++; return err; } @@ -68,8 +72,6 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) err = __xsk_rcv(xs, xdp); if (likely(!err)) xdp_return_buff(xdp); - else - xs->rx_dropped++; return err; } @@ -87,8 +89,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) err = __xsk_rcv(xs, xdp); if (!err) xsk_flush(xs); - else - xs->rx_dropped++; return err; } -- cgit v1.2.3-59-g8ed1b From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 13:57:13 +0200 Subject: xsk: new descriptor addressing scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. 
On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model, passing one single chunk into the fill-ring would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++--------------- include/uapi/linux/if_xdp.h | 12 ++--- net/xdp/xdp_umem.c | 33 ++++++------ net/xdp/xdp_umem.h | 27 +++------- net/xdp/xdp_umem_props.h | 4 +- net/xdp/xsk.c | 30 ++++++----- net/xdp/xsk_queue.c | 2 +- net/xdp/xsk_queue.h | 43 +++++++-------- 8 files changed, 123 insertions(+), 129 deletions(-) (limited to 'net/xdp') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 91928d9ee4bf..ff929cfab4f4 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -12,7 +12,7 @@ packet processing. This document assumes that the reader is familiar with BPF and XDP. If not, the Cilium project has an excellent reference guide at -http://cilium.readthedocs.io/en/doc-1.0/bpf/. +http://cilium.readthedocs.io/en/latest/bpf/. Using the XDP_REDIRECT action from an XDP program, the program can redirect ingress frames to other XDP enabled netdevs, using the @@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points to that packet can be changed to point to another and reused right away. This again avoids copying data. -The UMEM consists of a number of equally size frames and each frame -has a unique frame id. A descriptor in one of the rings references a -frame by referencing its frame id. The user space allocates memory for -this UMEM using whatever means it feels is most appropriate (malloc, -mmap, huge pages, etc). This memory area is then registered with the -kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two -rings: the FILL ring and the COMPLETION ring. The fill ring is used by -the application to send down frame ids for the kernel to fill in with -RX packet data. References to these frames will then appear in the RX -ring once each packet has been received. The completion ring, on the -other hand, contains frame ids that the kernel has transmitted -completely and can now be used again by user space, for either TX or -RX. Thus, the frame ids appearing in the completion ring are ids that -were previously transmitted using the TX ring. In summary, the RX and -FILL rings are used for the RX path and the TX and COMPLETION rings -are used for the TX path. +The UMEM consists of a number of equally sized chunks. A descriptor in +one of the rings references a frame by referencing its addr. The addr +is simply an offset within the entire UMEM region. The user space +allocates memory for this UMEM using whatever means it feels is most +appropriate (malloc, mmap, huge pages, etc). This memory area is then +registered with the kernel using the new setsockopt XDP_UMEM_REG. The +UMEM also has two rings: the FILL ring and the COMPLETION ring. The +fill ring is used by the application to send down addr for the kernel +to fill in with RX packet data. 
References to these frames will then +appear in the RX ring once each packet has been received. The +completion ring, on the other hand, contains frame addr that the +kernel has transmitted completely and can now be used again by user +space, for either TX or RX. Thus, the frame addrs appearing in the +completion ring are addrs that were previously transmitted using the +TX ring. In summary, the RX and FILL rings are used for the RX path +and the TX and COMPLETION rings are used for the TX path. The socket is then finally bound with a bind() call to a device and a specific queue id on that device, and it is not until bind is @@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind call and submits the XSK of the process it would like to share UMEM with as well as its own newly created XSK socket. The new process will -then receive frame id references in its own RX ring that point to this -shared UMEM. Note that since the ring structures are single-consumer / -single-producer (for performance reasons), the new process has to -create its own socket with associated RX and TX rings, since it cannot -share this with the other process. This is also the reason that there -is only one set of FILL and COMPLETION rings per UMEM. It is the -responsibility of a single process to handle the UMEM. +then receive frame addr references in its own RX ring that point to +this shared UMEM. Note that since the ring structures are +single-consumer / single-producer (for performance reasons), the new +process has to create its own socket with associated RX and TX rings, +since it cannot share this with the other process. This is also the +reason that there is only one set of FILL and COMPLETION rings per +UMEM. It is the responsibility of a single process to handle the UMEM. How is then packets distributed from an XDP program to the XSKs? There is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The @@ -102,10 +102,10 @@ UMEM UMEM is a region of virtual contiguous memory, divided into equal-sized frames. An UMEM is associated to a netdev and a specific -queue id of that netdev. It is created and configured (frame size, -frame headroom, start address and size) by using the XDP_UMEM_REG -setsockopt system call. A UMEM is bound to a netdev and queue id, via -the bind() system call. +queue id of that netdev. It is created and configured (chunk size, +headroom, start address and size) by using the XDP_UMEM_REG setsockopt +system call. A UMEM is bound to a netdev and queue id, via the bind() +system call. An AF_XDP is socket linked to a single UMEM, but one UMEM can have multiple AF_XDP sockets. To share an UMEM created via one socket A, @@ -147,13 +147,17 @@ UMEM Fill Ring ~~~~~~~~~~~~~~ The Fill ring is used to transfer ownership of UMEM frames from -user-space to kernel-space. The UMEM indicies are passed in the -ring. As an example, if the UMEM is 64k and each frame is 4k, then the -UMEM has 16 frames and can pass indicies between 0 and 15. +user-space to kernel-space. The UMEM addrs are passed in the ring. As +an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has +16 chunks and can pass addrs between 0 and 64k. Frames passed to the kernel are used for the ingress path (RX rings). -The user application produces UMEM indicies to this ring. +The user application produces UMEM addrs to this ring. Note that the +kernel will mask the incoming addr. E.g. 
for a chunk size of 2k, the +log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050 +and 3000 refers to the same chunk. + UMEM Completetion Ring ~~~~~~~~~~~~~~~~~~~~~~ @@ -165,16 +169,15 @@ used. Frames passed from the kernel to user-space are frames that has been sent (TX ring) and can be used by user-space again. -The user application consumes UMEM indicies from this ring. +The user application consumes UMEM addrs from this ring. RX Ring ~~~~~~~ The RX ring is the receiving side of a socket. Each entry in the ring -is a struct xdp_desc descriptor. The descriptor contains UMEM index -(idx), the length of the data (len), the offset into the frame -(offset). +is a struct xdp_desc descriptor. The descriptor contains UMEM offset +(addr) and the length of the data (len). If no frames have been passed to kernel via the Fill ring, no descriptors will (or can) appear on the RX ring. @@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c. Naive ring dequeue and enqueue could look like this:: + // struct xdp_rxtx_ring { + // __u32 *producer; + // __u32 *consumer; + // struct xdp_desc *desc; + // }; + + // struct xdp_umem_ring { + // __u32 *producer; + // __u32 *consumer; + // __u64 *desc; + // }; + // typedef struct xdp_rxtx_ring RING; // typedef struct xdp_umem_ring RING; // typedef struct xdp_desc RING_TYPE; - // typedef __u32 RING_TYPE; + // typedef __u64 RING_TYPE; int dequeue_one(RING *ring, RING_TYPE *item) { - __u32 entries = ring->ptrs.producer - ring->ptrs.consumer; + __u32 entries = *ring->producer - *ring->consumer; if (entries == 0) return -1; // read-barrier! - *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)]; - ring->ptrs.consumer++; + *item = ring->desc[*ring->consumer & (RING_SIZE - 1)]; + (*ring->consumer)++; return 0; } int enqueue_one(RING *ring, const RING_TYPE *item) { - u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer); + u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer); if (free_entries == 0) return -1; - ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item; + ring->desc[*ring->producer & (RING_SIZE - 1)] = *item; // write-barrier! 
- ring->ptrs.producer++; + (*ring->producer)++; return 0; } diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 4737cfe222f5..e411d6f9ac65 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -48,8 +48,8 @@ struct xdp_mmap_offsets { struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ - __u32 frame_size; /* Frame size */ - __u32 frame_headroom; /* Frame head room */ + __u32 chunk_size; + __u32 headroom; }; struct xdp_statistics { @@ -66,13 +66,11 @@ struct xdp_statistics { /* Rx/Tx descriptor */ struct xdp_desc { - __u32 idx; + __u64 addr; __u32 len; - __u16 offset; - __u8 flags; - __u8 padding[5]; + __u32 options; }; -/* UMEM descriptor is __u32 */ +/* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 87998818116f..9ad791ff4739 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -14,7 +14,7 @@ #include "xdp_umem.h" -#define XDP_UMEM_MIN_FRAME_SIZE 2048 +#define XDP_UMEM_MIN_CHUNK_SIZE 2048 static void xdp_umem_unpin_pages(struct xdp_umem *umem) { @@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { - u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - unsigned int nframes, nfpp; int size_chk, err; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or @@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(frame_size)) + if (!is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - nframes = (unsigned int)div_u64(size, frame_size); - if (nframes == 0 || nframes > UINT_MAX) + chunks = (unsigned int)div_u64(size, chunk_size); + if (chunks == 0) return -EINVAL; - nfpp = PAGE_SIZE / frame_size; - if (nframes < nfpp || nframes % nfpp) + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) return -EINVAL; - frame_headroom = ALIGN(frame_headroom, 64); + headroom = ALIGN(headroom, 64); - size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; umem->pid = get_task_pid(current, PIDTYPE_PID); - umem->size = (size_t)size; umem->address = (unsigned long)addr; - umem->props.frame_size = frame_size; - umem->props.nframes = nframes; - umem->frame_headroom = frame_headroom; + umem->props.chunk_mask = ~((u64)chunk_size - 1); + umem->props.size = size; + umem->headroom = headroom; + umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; - umem->frame_size_log2 = ilog2(frame_size); - umem->nfpp_mask = nfpp - 1; - umem->nfpplog2 = ilog2(nfpp); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 0881cf456230..aeadd1bcb72d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -18,35 +18,20 @@ struct 
xdp_umem { struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; - u32 npgs; - u32 frame_headroom; - u32 nfpp_mask; - u32 nfpplog2; - u32 frame_size_log2; + u32 headroom; + u32 chunk_size_nohr; struct user_struct *user; struct pid *pid; unsigned long address; - size_t size; refcount_t users; struct work_struct work; + u32 npgs; }; -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) -{ - u64 pg, off; - char *data; - - pg = idx >> umem->nfpplog2; - off = (idx & umem->nfpp_mask) << umem->frame_size_log2; - - data = page_address(umem->pgs[pg]); - return data + off; -} - -static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, - u32 idx) +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return xdp_umem_get_data(umem, idx) + umem->frame_headroom; + return page_address(umem->pgs[addr >> PAGE_SHIFT]) + + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 2cf8ec485fd2..40eab10dfc49 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -7,8 +7,8 @@ #define XDP_UMEM_PROPS_H_ struct xdp_umem_props { - u32 frame_size; - u32 nframes; + u64 chunk_mask; + u64 size; }; #endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 966307ce4b8e..4688c750df1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 id, len = xdp->data_end - xdp->data; + u32 len = xdp->data_end - xdp->data; void *buffer; + u64 addr; int err; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (!xskq_peek_id(xs->umem->fq, &id)) { + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; return -ENOSPC; } - buffer = xdp_umem_get_data_with_headroom(xs->umem, id); + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); - err = xskq_produce_batch_desc(xs->rx, id, len, - xs->umem->frame_headroom); + err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) - xskq_discard_id(xs->umem->fq); + xskq_discard_addr(xs->umem->fq); else xs->rx_dropped++; @@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_destruct_skb(struct sk_buff *skb) { - u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); - WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); sock_wfree(skb); } @@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, while (xskq_peek_desc(xs->tx, &desc)) { char *buffer; - u32 id, len; + u64 addr; + u32 len; if (max_batch-- == 0) { err = -EAGAIN; goto out; } - if (xskq_reserve_id(xs->umem->cq)) { + if (xskq_reserve_addr(xs->umem->cq)) { err = -EAGAIN; goto out; } @@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, } skb_put(skb, len); - id = desc.idx; - buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + addr = desc.addr; + buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) { kfree_skb(skb); @@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = 
sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb_shinfo(skb)->destructor_arg = (void *)(long)addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index ebe85e59507e..6c32e92e98fc 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); } static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b5924e7aeb2b..337e5ad3b10e 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -27,7 +27,7 @@ struct xdp_rxtx_ring { /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; - u32 desc[0] ____cacheline_aligned_in_smp; + u64 desc[0] ____cacheline_aligned_in_smp; }; struct xsk_queue { @@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) /* UMEM queue */ -static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (unlikely(idx >= q->umem_props.nframes)) { + if (addr >= q->umem_props.size) { q->invalid_descs++; return false; } + return true; } -static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - *id = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_id(q, *id)) - return id; + *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; + if (xskq_is_valid_addr(q, *addr)) + return addr; q->cons_tail++; } @@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) return NULL; } -static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) { if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); @@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) smp_rmb(); } - return xskq_validate_id(q, id); + return xskq_validate_addr(q, addr); } -static inline void xskq_discard_id(struct xsk_queue *q) +static inline void xskq_discard_addr(struct xsk_queue *q) { q->cons_tail++; } -static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - ring->desc[q->prod_tail++ & q->ring_mask] = id; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ smp_wmb(); @@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id) return 0; } -static inline int xskq_reserve_id(struct xsk_queue *q) +static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; @@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q) static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) { - u32 buff_len; - - if (unlikely(d->idx >= q->umem_props.nframes)) { - q->invalid_descs++; + if (!xskq_is_valid_addr(q, d->addr)) return false; - } - buff_len = 
q->umem_props.frame_size; - if (unlikely(d->len > buff_len || d->len == 0 || - d->offset > buff_len || d->offset + d->len > buff_len)) { + if (((d->addr + d->len) & q->umem_props.chunk_mask) != + (d->addr & q->umem_props.chunk_mask)) { q->invalid_descs++; return false; } @@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q) } static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u32 id, u32 len, u16 offset) + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx; @@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, return -ENOSPC; idx = (q->prod_head++) & q->ring_mask; - ring->desc[idx].idx = id; + ring->desc[idx].addr = addr; ring->desc[idx].len = len; - ring->desc[idx].offset = offset; return 0; } -- cgit v1.2.3-59-g8ed1b From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:51 +0200 Subject: xsk: moved struct xdp_umem definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy support. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 24 +++++++++++++++++++++++- net/xdp/xdp_umem.c | 1 + net/xdp/xdp_umem.h | 22 +--------------------- net/xdp/xsk_queue.h | 3 +-- 4 files changed, 26 insertions(+), 24 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7a647c56ec15..3a6cd88f179d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,12 +6,34 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include +#include #include +#include #include struct net_device; struct xsk_queue; -struct xdp_umem; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + u32 npgs; +}; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9ad791ff4739..2793a503223e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -13,6 +13,7 @@ #include #include "xdp_umem.h" +#include "xsk_queue.h" #define XDP_UMEM_MIN_CHUNK_SIZE 2048 diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index aeadd1bcb72d..9433e8af650a 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,27 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include -#include -#include - -#include "xsk_queue.h" -#include "xdp_umem_props.h" - -struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; - struct page **pgs; - struct xdp_umem_props props; - u32 headroom; - u32 chunk_size_nohr; - struct user_struct *user; - struct pid *pid; - unsigned long address; - refcount_t users; - struct work_struct work; - u32 npgs; -}; +#include static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 337e5ad3b10e..5246ed420a16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -8,8 +8,7 @@ #include #include - -#include "xdp_umem_props.h" +#include #define RX_BATCH_SIZE 16 -- cgit v1.2.3-59-g8ed1b From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:52 
+0200 Subject: xsk: introduce xdp_umem_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_umem_page structure holds the address of a page. We trade memory for a faster lookup. Later, we'll add the DMA address here as well. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 7 ++++++- net/xdp/xdp_umem.c | 15 ++++++++++++++- net/xdp/xdp_umem.h | 3 +-- 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3a6cd88f179d..caf343a7e224 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,10 +20,14 @@ struct xdp_umem_props { u64 size; }; +struct xdp_umem_page { + void *addr; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct page **pgs; + struct xdp_umem_page *pages; struct xdp_umem_props props; u32 headroom; u32 chunk_size_nohr; @@ -32,6 +36,7 @@ struct xdp_umem { unsigned long address; refcount_t users; struct work_struct work; + struct page **pgs; u32 npgs; }; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2793a503223e..aca826011f6c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem) goto out; mmput(mm); + kfree(umem->pages); + umem->pages = NULL; + xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -155,7 +158,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - int size_chk, err; + int size_chk, err, i; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: @@ -213,6 +216,16 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) err = xdp_umem_pin_pages(umem); if (err) goto out_account; + + umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + if (!umem->pages) { + err = -ENOMEM; + goto out_account; + } + + for (i = 0; i < umem->npgs; i++) + umem->pages[i].addr = page_address(umem->pgs[i]); + return 0; out_account: diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 9433e8af650a..40e8fa4a92af 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,8 +10,7 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return page_address(umem->pgs[addr >> PAGE_SHIFT]) + - (addr & (PAGE_SIZE - 1)); + return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); -- cgit v1.2.3-59-g8ed1b From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wire up the ndo_bpf call in bind. 
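As a rough sketch of the intended driver-side usage (hypothetical driver code, not part of this patch; every mydrv_* symbol is invented purely to illustrate the calling pattern), a zero-copy capable driver could refill its Rx ring from the UMEM fill ring via the two helpers this patch exports::

    /* Hypothetical Rx-refill loop built on xsk_umem_peek_addr()
     * and xsk_umem_discard_addr().
     */
    static int mydrv_alloc_rx_bufs(struct mydrv_rx_ring *rx, u16 count)
    {
            struct xdp_umem *umem = rx->umem;
            u64 addr;

            while (count--) {
                    /* Look at the next chunk address that userspace
                     * put on the fill ring, without consuming it yet.
                     */
                    if (!xsk_umem_peek_addr(umem, &addr))
                            return -ENOMEM; /* fill ring empty */

                    /* Hand the chunk to the hardware (driver
                     * specific; details elided).
                     */
                    mydrv_post_rx_buf(rx, addr);

                    /* Now consume the fill-ring entry. */
                    xsk_umem_discard_addr(umem);
            }

            return 0;
    }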
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 +++ include/uapi/linux/if_xdp.h | 4 +- net/xdp/xdp_umem.c | 77 ++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 3 ++ net/xdp/xsk.c | 96 +++++++++++++++++++++++++++++++++++---------- 5 files changed, 165 insertions(+), 21 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e411d6f9ac65..1fa0e977ea8d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index aca826011f6c..f729d79b8d91 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,81 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err; + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (force_copy) + return 0; + + dev_hold(dev); + + if (dev->netdev_ops->ndo_bpf) { + bpf.command = XDP_QUERY_XSK_UMEM; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; + } + + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? err : 0; /* fail or fallback */ + } + + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; + } + + dev_put(dev); + return force_zc ? 
-ENOTSUPP : 0; /* fail or fallback */ +} + +void xdp_umem_clear_dev(struct xdp_umem *umem) +{ + struct netdev_bpf bpf; + int err; + + if (umem->dev) { + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = NULL; + bpf.xsk.queue_id = umem->queue_id; + + rtnl_lock(); + err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); + rtnl_unlock(); + + if (err) + WARN(1, "failed to disable umem!\n"); + + dev_put(umem->dev); + umem->dev = NULL; + } +} + static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; @@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + xdp_umem_clear_dev(umem); + if (umem->fq) { xskq_destroy(umem->fq); umem->fq = NULL; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 40e8fa4a92af..674508a32a4d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4688c750df1d..ab64bd8260ea 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk) bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { - return !!xs->rx; + return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && + READ_ONCE(xs->umem->fq); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return xskq_peek_addr(umem->fq, addr); +} +EXPORT_SYMBOL(xsk_umem_peek_addr); + +void xsk_umem_discard_addr(struct xdp_umem *umem) +{ + xskq_discard_addr(umem->fq); +} +EXPORT_SYMBOL(xsk_umem_discard_addr); + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; int err; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; - if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; @@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) + if (!err) { xskq_discard_addr(xs->umem->fq); - else - xs->rx_dropped++; + xdp_return_buff(xdp); + return 0; + } + xs->rx_dropped++; return err; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err; + int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - err = __xsk_rcv(xs, xdp); - if (likely(!err)) + if (err) { xdp_return_buff(xdp); + xs->rx_dropped++; + } return err; } +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + len = xdp->data_end - xdp->data; + + return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? 
+ __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } + void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); @@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp->data_end - xdp->data; + void *buffer; + u64 addr; int err; - err = __xsk_rcv(xs, xdp); - if (!err) + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { + xs->rx_dropped++; + return -ENOSPC; + } + + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, addr, len); + if (!err) { + xskq_discard_addr(xs->umem->fq); xsk_flush(xs); + return 0; + } + xs->rx_dropped++; return err; } @@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; + u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || - (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { + qid = sxdp->sxdp_queue_id; + + if ((xs->rx && qid >= dev->real_num_rx_queues) || + (xs->tx && qid >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } - if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + flags = sxdp->sxdp_flags; + + if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + /* Cannot specify flags for shared sockets. */ + err = -EINVAL; + goto out_unlock; + } + if (xs->umem) { /* We have already our own. */ err = -EINVAL; @@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = -EBADF; sockfd_put(sock); goto out_unlock; - } else if (umem_xs->dev != dev || - umem_xs->queue_id != sxdp->sxdp_queue_id) { + } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { err = -EINVAL; sockfd_put(sock); goto out_unlock; @@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); xskq_set_umem(xs->umem->cq, &xs->umem->props); + + err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); + if (err) + goto out_unlock; } xs->dev = dev; -- cgit v1.2.3-59-g8ed1b From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:57 +0200 Subject: xsk: wire up Tx zero-copy functions Here we add the functionality required to support zero-copy Tx, and also expose various zero-copy related functions for the netdevs. 
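To illustrate how a driver might drive these new hooks (hypothetical code; the mydrv_* names are invented, and in a real driver the completion half would typically run from the Tx-completion interrupt rather than inline)::

    /* Hypothetical zero-copy Tx path, entered via the new
     * ndo_xsk_async_xmit callback. xsk_umem_consume_tx() also
     * lazily reserves a completion-ring slot for each descriptor.
     */
    static void mydrv_xsk_xmit(struct mydrv_tx_ring *tx, int budget)
    {
            struct xdp_umem *umem = tx->umem;
            dma_addr_t dma;
            u32 len;

            while (budget--) {
                    if (!xsk_umem_consume_tx(umem, &dma, &len))
                            break;

                    mydrv_post_tx_desc(tx, dma, len); /* driver specific */
            }

            mydrv_kick_hw(tx); /* driver-specific doorbell */
    }

    /* Later, once the hardware reports that 'completed' descriptors
     * are done: publish the lazily reserved completion entries and
     * wake any blocked writers.
     */
    static void mydrv_tx_clean(struct mydrv_tx_ring *tx, u32 completed)
    {
            xsk_umem_complete_tx(tx->umem, completed);
            xsk_umem_consume_tx_done(tx->umem);
    }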
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 9 +++++++ net/xdp/xdp_umem.c | 29 +++++++++++++++++++-- net/xdp/xdp_umem.h | 8 +++++- net/xdp/xsk.c | 70 +++++++++++++++++++++++++++++++++++++++++++++----- net/xdp/xsk_queue.h | 32 ++++++++++++++++++++++- 5 files changed, 137 insertions(+), 11 deletions(-) (limited to 'net/xdp') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d93d3aac3fc9..9fe472f2ac95 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ struct xdp_umem { struct net_device *dev; u16 queue_id; bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; }; struct xdp_sock { @@ -53,6 +56,8 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; @@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f729d79b8d91..7eb4948a38d2 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,29 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); +} + +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + if (xs->dev) { + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + + if (umem->zc) + synchronize_net(); + } +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, dev_hold(dev); - if (dev->netdev_ops->ndo_bpf) { + if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); @@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return force_zc ? 
-ENOTSUPP : 0; /* fail or fallback */ } -void xdp_umem_clear_dev(struct xdp_umem *umem) +static void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; @@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + INIT_LIST_HEAD(&umem->xsk_list); + spin_lock_init(&umem->xsk_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 674508a32a4d..f11560334f88 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags); -void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ab64bd8260ea..ddca4bf1cfc8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ + xskq_produce_flush_addr_n(umem->cq, nb_entries); +} +EXPORT_SYMBOL(xsk_umem_complete_tx); + +void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->sk.sk_write_space(&xs->sk); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(xsk_umem_consume_tx_done); + +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) +{ + struct xdp_desc desc; + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + if (!xskq_peek_desc(xs->tx, &desc)) + continue; + + if (xskq_produce_addr_lazy(umem->cq, desc.addr)) + goto out; + + *dma = xdp_umem_get_dma(umem, desc.addr); + *len = desc.len; + + xskq_discard_desc(xs->tx); + rcu_read_unlock(); + return true; + } + +out: + rcu_read_unlock(); + return false; +} +EXPORT_SYMBOL(xsk_umem_consume_tx); + +static int xsk_zc_xmit(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct net_device *dev = xs->dev; + + return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id); +} + static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; @@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb) static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, size_t total_len) { - bool need_wait = !(m->msg_flags & MSG_DONTWAIT); u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); bool sent_frame = false; @@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (unlikely(!xs->tx)) return -ENOBUFS; - if (need_wait) - return -EOPNOTSUPP; mutex_lock(&xs->mutex); @@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - skb = 
sock_alloc_send_skb(sk, len, !need_wait, &err); + skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; goto out; @@ -235,6 +286,7 @@ out: static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (need_wait) + return -EOPNOTSUPP; - return xsk_generic_xmit(sk, m, total_len); + return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } static unsigned int xsk_poll(struct file *file, struct socket *sock, @@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xs->dev = dev; - xs->queue_id = sxdp->sxdp_queue_id; - + xs->zc = xs->umem->zc; + xs->queue_id = qid; xskq_set_umem(xs->rx, &xs->umem->props); xskq_set_umem(xs->tx, &xs->umem->props); + xdp_add_sk_umem(xs->umem, xs); out_unlock: if (err) @@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk) xskq_destroy(xs->rx); xskq_destroy(xs->tx); + xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5246ed420a16..ef6a6f0ec949 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -11,6 +11,7 @@ #include #define RX_BATCH_SIZE 16 +#define LAZY_UPDATE_THRESHOLD 128 struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? dcnt : entries; } +static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) +{ + return q->nentries - (producer - q->cons_tail); +} + static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = q->nentries - (producer - q->cons_tail); + u32 free_entries = xskq_nb_free_lazy(q, producer); if (free_entries >= dcnt) return free_entries; @@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ @@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + + ring->desc[q->prod_head++ & q->ring_mask] = addr; + return 0; +} + +static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, + u32 nb_entries) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail += nb_entries; + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) -- cgit v1.2.3-59-g8ed1b
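A closing note on the lazy-producer pattern visible in the xsk_queue.h hunks above: xskq_produce_addr_lazy() stages entries at the private prod_head without any memory barrier, and xskq_produce_flush_addr_n() publishes a whole batch with a single write barrier and one update of the shared producer pointer. A minimal userspace analogue of the same idea (illustrative only; a C11-style release fence stands in for the kernel's smp_wmb())::

    #include <stdint.h>

    #define RING_SIZE 128 /* power of two, as in the kernel rings */

    struct ring {
            volatile uint32_t producer; /* shared with the consumer */
            uint32_t cons_tail;         /* cached consumer position */
            uint32_t prod_head;         /* private staging cursor */
            uint32_t prod_tail;         /* last published slot */
            uint64_t desc[RING_SIZE];
    };

    /* Stage one addr; no barrier, so it is not yet visible to the
     * consumer side.
     */
    static int produce_lazy(struct ring *r, uint64_t addr)
    {
            if (RING_SIZE - (r->prod_head - r->cons_tail) == 0)
                    return -1; /* ring full */
            r->desc[r->prod_head++ & (RING_SIZE - 1)] = addr;
            return 0;
    }

    /* Publish a batch of n staged entries with one barrier,
     * mirroring xskq_produce_flush_addr_n().
     */
    static void flush_n(struct ring *r, uint32_t n)
    {
            __atomic_thread_fence(__ATOMIC_RELEASE); /* order data before producer */
            r->prod_tail += n;
            r->producer = r->prod_tail;
    }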