aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core/uverbs_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core/uverbs_main.c')
-rw-r--r--drivers/infiniband/core/uverbs_main.c340
1 files changed, 276 insertions, 64 deletions
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 50152c1b1004..6d373f5515b7 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -45,6 +45,7 @@
#include <linux/cdev.h>
#include <linux/anon_inodes.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -72,7 +73,7 @@ enum {
static dev_t dynamic_uverbs_dev;
static struct class *uverbs_class;
-static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+static DEFINE_IDA(uverbs_ida);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
const char __user *buf, int in_len,
@@ -169,20 +170,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw)
return ret;
}
-static void ib_uverbs_release_dev(struct kobject *kobj)
+static void ib_uverbs_release_dev(struct device *device)
{
struct ib_uverbs_device *dev =
- container_of(kobj, struct ib_uverbs_device, kobj);
+ container_of(device, struct ib_uverbs_device, dev);
uverbs_destroy_api(dev->uapi);
cleanup_srcu_struct(&dev->disassociate_srcu);
kfree(dev);
}
-static struct kobj_type ib_uverbs_dev_ktype = {
- .release = ib_uverbs_release_dev,
-};
-
static void ib_uverbs_release_async_event_file(struct kref *ref)
{
struct ib_uverbs_async_event_file *file =
@@ -265,7 +262,7 @@ void ib_uverbs_release_file(struct kref *ref)
if (atomic_dec_and_test(&file->device->refcount))
ib_uverbs_comp_dev(file->device);
- kobject_put(&file->device->kobj);
+ put_device(&file->device->dev);
kfree(file);
}
@@ -817,6 +814,226 @@ out:
}
/*
+ * Each time we map IO memory into user space this keeps track of the mapping.
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point
+ * to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ */
+struct rdma_umap_priv {
+ struct vm_area_struct *vma;
+ struct list_head list;
+};
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+ struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+ priv->vma = vma;
+ vma->vm_private_data = priv;
+ vma->vm_ops = &rdma_umap_ops;
+
+ mutex_lock(&ufile->umap_lock);
+ list_add(&priv->list, &ufile->umaps);
+ mutex_unlock(&ufile->umap_lock);
+}
+
+/*
+ * The VMA has been dup'd, initialize the vm_private_data with a new tracking
+ * struct
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+ struct rdma_umap_priv *opriv = vma->vm_private_data;
+ struct rdma_umap_priv *priv;
+
+ if (!opriv)
+ return;
+
+ /* We are racing with disassociation */
+ if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+ goto out_zap;
+ /*
+ * Disassociation already completed, the VMA should already be zapped.
+ */
+ if (!ufile->ucontext)
+ goto out_unlock;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto out_unlock;
+ rdma_umap_priv_init(priv, vma);
+
+ up_read(&ufile->hw_destroy_rwsem);
+ return;
+
+out_unlock:
+ up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+ /*
+ * We can't allow the VMA to be created with the actual IO pages, that
+ * would break our API contract, and it can't be stopped at this
+ * point, so zap it.
+ */
+ vma->vm_private_data = NULL;
+ zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+ struct rdma_umap_priv *priv = vma->vm_private_data;
+
+ if (!priv)
+ return;
+
+ /*
+ * The vma holds a reference on the struct file that created it, which
+ * in turn means that the ib_uverbs_file is guaranteed to exist at
+ * this point.
+ */
+ mutex_lock(&ufile->umap_lock);
+ list_del(&priv->list);
+ mutex_unlock(&ufile->umap_lock);
+ kfree(priv);
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+ .open = rdma_umap_open,
+ .close = rdma_umap_close,
+};
+
+static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma,
+ unsigned long size)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ struct rdma_umap_priv *priv;
+
+ if (vma->vm_end - vma->vm_start != size)
+ return ERR_PTR(-EINVAL);
+
+ /* Driver is using this wrong, must be called by ib_uverbs_mmap */
+ if (WARN_ON(!vma->vm_file ||
+ vma->vm_file->private_data != ufile))
+ return ERR_PTR(-EINVAL);
+ lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return ERR_PTR(-ENOMEM);
+ return priv;
+}
+
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ vma->vm_page_prot = prot;
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+ kfree(priv);
+ return -EAGAIN;
+ }
+
+ rdma_umap_priv_init(priv, vma);
+ return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/*
+ * The page case is here for a slightly different reason, the driver expects
+ * to be able to free the page it is sharing to user space when it destroys
+ * its ucontext, which means we need to zap the user space references.
+ *
+ * We could handle this differently by providing an API to allocate a shared
+ * page and then only freeing the shared page when the last ufile is
+ * destroyed.
+ */
+int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma, struct page *page,
+ unsigned long size)
+{
+ struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
+ vma->vm_page_prot)) {
+ kfree(priv);
+ return -EAGAIN;
+ }
+
+ rdma_umap_priv_init(priv, vma);
+ return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_page);
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+ struct rdma_umap_priv *priv, *next_priv;
+
+ lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+ while (1) {
+ struct mm_struct *mm = NULL;
+
+ /* Get an arbitrary mm pointer that hasn't been cleaned yet */
+ mutex_lock(&ufile->umap_lock);
+ if (!list_empty(&ufile->umaps)) {
+ mm = list_first_entry(&ufile->umaps,
+ struct rdma_umap_priv, list)
+ ->vma->vm_mm;
+ mmget(mm);
+ }
+ mutex_unlock(&ufile->umap_lock);
+ if (!mm)
+ return;
+
+ /*
+ * The umap_lock is nested under mmap_sem since it used within
+ * the vma_ops callbacks, so we have to clean the list one mm
+ * at a time to get the lock ordering right. Typically there
+ * will only be one mm, so no big deal.
+ */
+ down_write(&mm->mmap_sem);
+ mutex_lock(&ufile->umap_lock);
+ list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+ list) {
+ struct vm_area_struct *vma = priv->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+ list_del_init(&priv->list);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+ }
+ mutex_unlock(&ufile->umap_lock);
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ }
+}
+
+/*
* ib_uverbs_open() does not need the BKL:
*
* - the ib_uverbs_device structures are properly reference counted and
@@ -839,6 +1056,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
if (!atomic_inc_not_zero(&dev->refcount))
return -ENXIO;
+ get_device(&dev->dev);
srcu_key = srcu_read_lock(&dev->disassociate_srcu);
mutex_lock(&dev->lists_mutex);
ib_dev = srcu_dereference(dev->ib_dev,
@@ -876,9 +1094,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
spin_lock_init(&file->uobjects_lock);
INIT_LIST_HEAD(&file->uobjects);
init_rwsem(&file->hw_destroy_rwsem);
+ mutex_init(&file->umap_lock);
+ INIT_LIST_HEAD(&file->umaps);
filp->private_data = file;
- kobject_get(&dev->kobj);
list_add_tail(&file->list, &dev->uverbs_file_list);
mutex_unlock(&dev->lists_mutex);
srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
@@ -899,6 +1118,7 @@ err:
if (atomic_dec_and_test(&dev->refcount))
ib_uverbs_comp_dev(dev);
+ put_device(&dev->dev);
return ret;
}
@@ -909,10 +1129,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);
mutex_lock(&file->device->lists_mutex);
- if (!file->is_closed) {
- list_del(&file->list);
- file->is_closed = 1;
- }
+ list_del_init(&file->list);
mutex_unlock(&file->device->lists_mutex);
if (file->async_file)
@@ -951,37 +1168,34 @@ static struct ib_client uverbs_client = {
.remove = ib_uverbs_remove_one
};
-static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
+ struct ib_uverbs_device *dev =
+ container_of(device, struct ib_uverbs_device, dev);
int ret = -ENODEV;
int srcu_key;
- struct ib_uverbs_device *dev = dev_get_drvdata(device);
struct ib_device *ib_dev;
- if (!dev)
- return -ENODEV;
-
srcu_key = srcu_read_lock(&dev->disassociate_srcu);
ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
if (ib_dev)
- ret = sprintf(buf, "%s\n", ib_dev->name);
+ ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev));
srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
return ret;
}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
-static ssize_t show_dev_abi_version(struct device *device,
- struct device_attribute *attr, char *buf)
+static ssize_t abi_version_show(struct device *device,
+ struct device_attribute *attr, char *buf)
{
- struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_uverbs_device *dev =
+ container_of(device, struct ib_uverbs_device, dev);
int ret = -ENODEV;
int srcu_key;
struct ib_device *ib_dev;
- if (!dev)
- return -ENODEV;
srcu_key = srcu_read_lock(&dev->disassociate_srcu);
ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
if (ib_dev)
@@ -990,7 +1204,17 @@ static ssize_t show_dev_abi_version(struct device *device,
return ret;
}
-static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+static DEVICE_ATTR_RO(abi_version);
+
+static struct attribute *ib_dev_attrs[] = {
+ &dev_attr_abi_version.attr,
+ &dev_attr_ibdev.attr,
+ NULL,
+};
+
+static const struct attribute_group dev_attr_group = {
+ .attrs = ib_dev_attrs,
+};
static CLASS_ATTR_STRING(abi_version, S_IRUGO,
__stringify(IB_USER_VERBS_ABI_VERSION));
@@ -1028,65 +1252,56 @@ static void ib_uverbs_add_one(struct ib_device *device)
return;
}
+ device_initialize(&uverbs_dev->dev);
+ uverbs_dev->dev.class = uverbs_class;
+ uverbs_dev->dev.parent = device->dev.parent;
+ uverbs_dev->dev.release = ib_uverbs_release_dev;
+ uverbs_dev->groups[0] = &dev_attr_group;
+ uverbs_dev->dev.groups = uverbs_dev->groups;
atomic_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
- kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
mutex_init(&uverbs_dev->lists_mutex);
INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
- devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
- if (devnum >= IB_UVERBS_MAX_DEVICES)
+ devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
+ GFP_KERNEL);
+ if (devnum < 0)
goto err;
uverbs_dev->devnum = devnum;
- set_bit(devnum, dev_map);
if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
else
base = IB_UVERBS_BASE_DEV + devnum;
- rcu_assign_pointer(uverbs_dev->ib_dev, device);
- uverbs_dev->num_comp_vectors = device->num_comp_vectors;
-
if (ib_uverbs_create_uapi(device, uverbs_dev))
goto err_uapi;
- cdev_init(&uverbs_dev->cdev, NULL);
+ uverbs_dev->dev.devt = base;
+ dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
+
+ cdev_init(&uverbs_dev->cdev,
+ device->mmap ? &uverbs_mmap_fops : &uverbs_fops);
uverbs_dev->cdev.owner = THIS_MODULE;
- uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
- cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj);
- kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
- if (cdev_add(&uverbs_dev->cdev, base, 1))
- goto err_cdev;
-
- uverbs_dev->dev = device_create(uverbs_class, device->dev.parent,
- uverbs_dev->cdev.dev, uverbs_dev,
- "uverbs%d", uverbs_dev->devnum);
- if (IS_ERR(uverbs_dev->dev))
- goto err_cdev;
-
- if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
- goto err_class;
- if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
- goto err_class;
- ib_set_client_data(device, &uverbs_client, uverbs_dev);
+ ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
+ if (ret)
+ goto err_uapi;
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
return;
-err_class:
- device_destroy(uverbs_class, uverbs_dev->cdev.dev);
-err_cdev:
- cdev_del(&uverbs_dev->cdev);
err_uapi:
- clear_bit(devnum, dev_map);
+ ida_free(&uverbs_ida, devnum);
err:
if (atomic_dec_and_test(&uverbs_dev->refcount))
ib_uverbs_comp_dev(uverbs_dev);
wait_for_completion(&uverbs_dev->comp);
- kobject_put(&uverbs_dev->kobj);
+ put_device(&uverbs_dev->dev);
return;
}
@@ -1107,8 +1322,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
while (!list_empty(&uverbs_dev->uverbs_file_list)) {
file = list_first_entry(&uverbs_dev->uverbs_file_list,
struct ib_uverbs_file, list);
- file->is_closed = 1;
- list_del(&file->list);
+ list_del_init(&file->list);
kref_get(&file->ref);
/* We must release the mutex before going ahead and calling
@@ -1156,10 +1370,8 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
if (!uverbs_dev)
return;
- dev_set_drvdata(uverbs_dev->dev, NULL);
- device_destroy(uverbs_class, uverbs_dev->cdev.dev);
- cdev_del(&uverbs_dev->cdev);
- clear_bit(uverbs_dev->devnum, dev_map);
+ cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
+ ida_free(&uverbs_ida, uverbs_dev->devnum);
if (device->disassociate_ucontext) {
/* We disassociate HW resources and immediately return.
@@ -1182,7 +1394,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
if (wait_clients)
wait_for_completion(&uverbs_dev->comp);
- kobject_put(&uverbs_dev->kobj);
+ put_device(&uverbs_dev->dev);
}
static char *uverbs_devnode(struct device *dev, umode_t *mode)