aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Kalderon <michal.kalderon@marvell.com>2019-10-30 11:44:11 +0200
committerJason Gunthorpe <jgg@mellanox.com>2019-11-06 13:08:00 -0400
commit3411f9f01b76bd88aa6e0e013847ab6479cb4f24 (patch)
tree2c5234ba44b6a69e48e83bf407828ce33966d9ab
parentRDMA/core: Move core content from ib_uverbs to ib_core (diff)
downloadlinux-dev-3411f9f01b76bd88aa6e0e013847ab6479cb4f24.tar.xz
linux-dev-3411f9f01b76bd88aa6e0e013847ab6479cb4f24.zip
RDMA/core: Create mmap database and cookie helper functions
Create some common APIs for adding entries to the mmap_xa, searching for an entry, and freeing one. The general approach is copied from the EFA driver and improved to be more general and to do more to help the drivers. Integration with the core allows a reference-counted scheme with a free function so that the driver can know when its mmaps are all gone. This significant new functionality will help drivers have the correct lifetime model for mmap objects. Link: https://lore.kernel.org/r/20191030094417.16866-3-michal.kalderon@marvell.com Signed-off-by: Ariel Elior <ariel.elior@marvell.com> Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com> Reviewed-by: Jason Gunthorpe <jgg@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
-rw-r--r--drivers/infiniband/core/device.c1
-rw-r--r--drivers/infiniband/core/ib_core_uverbs.c236
-rw-r--r--drivers/infiniband/core/rdma_core.c1
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c2
-rw-r--r--include/rdma/ib_verbs.h35
5 files changed, 275 insertions, 0 deletions
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f8d383ceae05..e785bebaf16e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2642,6 +2642,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
SET_DEVICE_OP(dev_ops, map_phys_fmr);
SET_DEVICE_OP(dev_ops, mmap);
+ SET_DEVICE_OP(dev_ops, mmap_free);
SET_DEVICE_OP(dev_ops, modify_ah);
SET_DEVICE_OP(dev_ops, modify_cq);
SET_DEVICE_OP(dev_ops, modify_device);
diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c
index b74d2a2fb342..aacd84a45de6 100644
--- a/drivers/infiniband/core/ib_core_uverbs.c
+++ b/drivers/infiniband/core/ib_core_uverbs.c
@@ -71,3 +71,239 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/**
+ * rdma_user_mmap_entry_get_pgoff() - Look up an mmap entry by page offset
+ *
+ * @ucontext: associated user context
+ * @pgoff: The mmap offset >> PAGE_SHIFT
+ *
+ * Called when userspace issues mmap() with an offset (as returned by
+ * rdma_user_mmap_get_offset()) that it previously received from the driver.
+ * The entry must have been added with rdma_user_mmap_entry_insert(). Only
+ * the first page offset of an entry matches, and offsets above U32_MAX
+ * never match. On success the refcnt of the entry is increased so that it
+ * won't be deleted from the xarray in the meantime.
+ *
+ * Return: a reference to the entry if it exists, or NULL if there is no
+ * match. rdma_user_mmap_entry_put() must be called to put the reference.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+			       unsigned long pgoff)
+{
+	struct rdma_user_mmap_entry *found = NULL;
+	struct rdma_user_mmap_entry *candidate;
+
+	if (pgoff > U32_MAX)
+		return NULL;
+
+	xa_lock(&ucontext->mmap_xa);
+
+	candidate = xa_load(&ucontext->mmap_xa, pgoff);
+
+	/*
+	 * Accept the candidate only when @pgoff is its first page, the
+	 * driver has not removed it (driver_removed means no further mmaps
+	 * are possible and we are only waiting for the active VMAs to be
+	 * closed), and its refcount is not already zero (a zero refcount
+	 * means the entry is being deleted).
+	 */
+	if (candidate && candidate->start_pgoff == pgoff &&
+	    !candidate->driver_removed &&
+	    kref_get_unless_zero(&candidate->ref))
+		found = candidate;
+
+	xa_unlock(&ucontext->mmap_xa);
+
+	if (!found)
+		return NULL;
+
+	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
+		  pgoff, found->npages);
+
+	return found;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
+
+/**
+ * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa for a VMA
+ *
+ * @ucontext: associated user context
+ * @vma: the vma being mmap'd into
+ *
+ * Like rdma_user_mmap_entry_get_pgoff(), using @vma->vm_pgoff as the page
+ * offset, but additionally validates the VMA: it must be a VM_SHARED
+ * mapping and its length must be exactly the entry's npages * PAGE_SIZE.
+ *
+ * Return: a referenced entry, or NULL if the VMA does not match any entry.
+ * The reference must be dropped with rdma_user_mmap_entry_put().
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+			 struct vm_area_struct *vma)
+{
+	unsigned long vma_bytes = vma->vm_end - vma->vm_start;
+	struct rdma_user_mmap_entry *entry;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return NULL;
+
+	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
+	if (!entry)
+		return NULL;
+
+	if (entry->npages * PAGE_SIZE == vma_bytes)
+		return entry;
+
+	/* Length mismatch: give back the reference taken by the lookup. */
+	rdma_user_mmap_entry_put(entry);
+	return NULL;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+/*
+ * kref release callback for an mmap entry. Erases every mmap_xa index the
+ * entry occupies and then passes the entry to the driver's mmap_free op,
+ * if the driver provides one.
+ */
+static void rdma_user_mmap_entry_free(struct kref *kref)
+{
+	struct rdma_user_mmap_entry *entry =
+		container_of(kref, struct rdma_user_mmap_entry, ref);
+	struct ib_ucontext *ucontext = entry->ucontext;
+	struct xarray *xa = &ucontext->mmap_xa;
+	unsigned long pgoff = entry->start_pgoff;
+	void (*free_cb)(struct rdma_user_mmap_entry *entry);
+	size_t remaining;
+
+	/*
+	 * The entry occupies npages consecutive indexes starting at
+	 * start_pgoff. Erasing them is deferred until all VMA are closed so
+	 * that the mmap offsets remain unique.
+	 */
+	xa_lock(xa);
+	for (remaining = entry->npages; remaining; remaining--)
+		__xa_erase(xa, pgoff++);
+	xa_unlock(xa);
+
+	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
+		  entry->start_pgoff, entry->npages);
+
+	free_cb = ucontext->device->ops.mmap_free;
+	if (free_cb)
+		free_cb(entry);
+}
+
+/**
+ * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
+ *
+ * @entry: an entry in the mmap_xa
+ *
+ * This function is called when the mapping is closed if it was
+ * an io mapping or when the driver is done with the entry for
+ * some other reason.
+ * It should be called once rdma_user_mmap_entry_get() (or
+ * rdma_user_mmap_entry_get_pgoff()) has returned the entry and the
+ * entry is no longer needed. When the refcnt reaches zero,
+ * rdma_user_mmap_entry_free() runs: it erases the entry from the
+ * mmap_xa and calls the driver's mmap_free op, if one is set.
+ */
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+ kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_put);
+
+/**
+ * rdma_user_mmap_entry_remove() - Drop reference to entry and
+ *				   mark it as unmmapable
+ *
+ * @entry: the entry to remove; it must have been inserted with
+ *	   rdma_user_mmap_entry_insert(). NULL is accepted and ignored.
+ *
+ * Drivers can call this to prevent userspace from creating more mappings for
+ * entry, however existing mmaps continue to exist and ops->mmap_free() will
+ * not be called until all user mmaps are destroyed.
+ */
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+	if (!entry)
+		return;
+
+	/*
+	 * driver_removed is tested by rdma_user_mmap_entry_get_pgoff() while
+	 * it holds the xa_lock. Set it under the same lock so that a lookup
+	 * either completes before the flag is set or observes it; an
+	 * unlocked store gives no such ordering against a concurrent lookup.
+	 */
+	xa_lock(&entry->ucontext->mmap_xa);
+	entry->driver_removed = true;
+	xa_unlock(&entry->ucontext->mmap_xa);
+
+	kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
+
+/**
+ * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length in bytes of the region that will be mmapped; it is
+ *	    rounded up to a whole number of pages
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for implementing their mmap syscall. A database of mmap offsets
+ * is handled in the core and helper functions are provided to insert entries
+ * into the database and extract entries when the user calls mmap with the
+ * given offset. The function allocates a unique page offset that should be
+ * provided to user, the user will use the offset to retrieve information such
+ * as address to be mapped and how.
+ *
+ * On success the entry's refcount is initialised and its ucontext, npages
+ * and start_pgoff fields are filled in.
+ *
+ * Return: 0 on success, -EINVAL if @entry is NULL, and -ENOMEM if no free
+ * range could be found or storing the entry in the mmap_xa failed.
+ */
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ XA_STATE(xas, &ucontext->mmap_xa, 0);
+ u32 xa_first, xa_last, npages;
+ int err;
+ u32 i;
+
+ if (!entry)
+ return -EINVAL;
+
+ kref_init(&entry->ref);
+ entry->ucontext = ucontext;
+
+ /*
+ * We want the whole allocation to be done without interruption from a
+ * different thread. The allocation requires finding a free range and
+ * storing. During the xa_insert the lock could be released, possibly
+ * allowing another thread to choose the same range.
+ */
+ mutex_lock(&ufile->umap_lock);
+
+ xa_lock(&ucontext->mmap_xa);
+
+ /*
+ * We want to find an empty range of npages consecutive indexes,
+ * [xa_first, xa_last).
+ */
+ npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
+ entry->npages = npages;
+ while (true) {
+ /*
+ * First find an empty index; give up with -ENOMEM if the search
+ * leaves the state at XAS_RESTART.
+ */
+ xas_find_marked(&xas, U32_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ goto err_unlock;
+
+ xa_first = xas.xa_index;
+
+ /* Is there enough room to have the range? */
+ if (check_add_overflow(xa_first, npages, &xa_last))
+ goto err_unlock;
+
+ /*
+ * Now look for the next present entry. If an entry doesn't
+ * exist, we found an empty range and can proceed.
+ */
+ xas_next_entry(&xas, xa_last - 1);
+ if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
+ break;
+ }
+
+ for (i = xa_first; i < xa_last; i++) {
+ err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
+ if (err)
+ goto err_undo;
+ }
+
+ /*
+ * Internally the kernel uses a page offset, in libc this is a byte
+ * offset. Drivers should not return pgoff to userspace.
+ */
+ entry->start_pgoff = xa_first;
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
+ entry->start_pgoff, npages);
+
+ return 0;
+
+err_undo: /* erase the indexes stored so far, i.e. [xa_first, i) */
+ for (; i > xa_first; i--)
+ __xa_erase(&ucontext->mmap_xa, i - 1);
+
+err_unlock:
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index ccf4d069c25c..6c72773faf29 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
rdma_restrack_del(&ucontext->res);
ib_dev->ops.dealloc_ucontext(ucontext);
+ WARN_ON(!xa_empty(&ucontext->mmap_xa));
kfree(ucontext);
ufile->ucontext = NULL;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 14a80fd9f464..06ed32c8662f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -252,6 +252,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
ucontext->closing = false;
ucontext->cleanup_retryable = false;
+ xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
+
ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
goto err_free;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0626b62ed107..8865ec28180a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1473,6 +1473,7 @@ struct ib_ucontext {
* Implementation details of the RDMA core, don't use in drivers:
*/
struct rdma_restrack_entry res;
+ struct xarray mmap_xa;
};
struct ib_uobject {
@@ -2258,6 +2259,21 @@ struct iw_cm_conn_param;
#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
+struct rdma_user_mmap_entry {
+ struct kref ref; /* refcount; the last put runs rdma_user_mmap_entry_free() */
+ struct ib_ucontext *ucontext; /* owning context, set by rdma_user_mmap_entry_insert() */
+ unsigned long start_pgoff; /* first page offset assigned in mmap_xa */
+ size_t npages; /* number of consecutive page offsets the entry occupies */
+ bool driver_removed; /* set by rdma_user_mmap_entry_remove(); lookups then fail */
+};
+
+/*
+ * Convert the entry's starting page offset into the byte offset the user
+ * should pass to libc's mmap(). The value is widened to 64 bits before
+ * shifting.
+ */
+static inline u64
+rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry)
+{
+	u64 pgoff = entry->start_pgoff;
+
+	return pgoff << PAGE_SHIFT;
+}
+
/**
* struct ib_device_ops - InfiniBand device operations
* This structure defines all the InfiniBand device operations, providers will
@@ -2370,6 +2386,13 @@ struct ib_device_ops {
struct ib_udata *udata);
void (*dealloc_ucontext)(struct ib_ucontext *context);
int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
+ /**
+ * This will be called once refcount of an entry in mmap_xa reaches
+ * zero. The type of the memory that was mapped may differ between
+ * entries and is opaque to the rdma_user_mmap interface.
+ * Therefore needs to be implemented by the driver in mmap_free.
+ */
+ void (*mmap_free)(struct rdma_user_mmap_entry *entry);
void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
@@ -2815,6 +2838,18 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
return -EINVAL;
}
#endif
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+ unsigned long pgoff);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma);
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry);
+
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry);
static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
{