aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile2
-rw-r--r--drivers/infiniband/core/cache.c8
-rw-r--r--drivers/infiniband/core/cm.c66
-rw-r--r--drivers/infiniband/core/cm_msgs.h32
-rw-r--r--drivers/infiniband/core/cma.c107
-rw-r--r--drivers/infiniband/core/core_priv.h11
-rw-r--r--drivers/infiniband/core/counters.c40
-rw-r--r--drivers/infiniband/core/device.c50
-rw-r--r--drivers/infiniband/core/ib_core_uverbs.c335
-rw-r--r--drivers/infiniband/core/iwpm_util.h5
-rw-r--r--drivers/infiniband/core/mad.c31
-rw-r--r--drivers/infiniband/core/nldev.c141
-rw-r--r--drivers/infiniband/core/rdma_core.c1
-rw-r--r--drivers/infiniband/core/restrack.c20
-rw-r--r--drivers/infiniband/core/restrack.h1
-rw-r--r--drivers/infiniband/core/rw.c25
-rw-r--r--drivers/infiniband/core/sa_query.c2
-rw-r--r--drivers/infiniband/core/sysfs.c12
-rw-r--r--drivers/infiniband/core/umem.c12
-rw-r--r--drivers/infiniband/core/umem_odp.c38
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c2
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c3
-rw-r--r--drivers/infiniband/core/uverbs_main.c84
-rw-r--r--drivers/infiniband/core/verbs.c12
24 files changed, 671 insertions, 369 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 09881bd5f12d..9a8871e21545 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
multicast.o mad.o smi.o agent.o mad_rmpp.o \
- nldev.o restrack.o counters.o
+ nldev.o restrack.o counters.o ib_core_uverbs.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 00fb3eacda19..d535995711c3 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -819,22 +819,16 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table)
{
int i;
- bool deleted = false;
if (!table)
return;
mutex_lock(&table->lock);
for (i = 0; i < table->sz; ++i) {
- if (is_gid_entry_valid(table->data_vec[i])) {
+ if (is_gid_entry_valid(table->data_vec[i]))
del_gid(ib_dev, port, table, i);
- deleted = true;
- }
}
mutex_unlock(&table->lock);
-
- if (deleted)
- dispatch_gid_change_event(ib_dev, port);
}
void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 5920c0085d35..455b3659d84b 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1,36 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
* Copyright (c) 2004 Topspin Corporation. All rights reserved.
* Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
*/
#include <linux/completion.h>
@@ -246,7 +220,7 @@ struct cm_work {
};
struct cm_timewait_info {
- struct cm_work work; /* Must be first. */
+ struct cm_work work;
struct list_head list;
struct rb_node remote_qp_node;
struct rb_node remote_id_node;
@@ -263,7 +237,7 @@ struct cm_id_private {
struct rb_node sidr_id_node;
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
- atomic_t refcount;
+ refcount_t refcount;
/* Number of clients sharing this ib_cm_id. Only valid for listeners.
* Protected by the cm.lock spinlock. */
int listen_sharecount;
@@ -308,7 +282,7 @@ static void cm_work_handler(struct work_struct *work);
static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
{
- if (atomic_dec_and_test(&cm_id_priv->refcount))
+ if (refcount_dec_and_test(&cm_id_priv->refcount))
complete(&cm_id_priv->comp);
}
@@ -365,7 +339,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
m->ah = ah;
m->retries = cm_id_priv->max_cm_retries;
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
m->context[0] = cm_id_priv;
*msg = m;
@@ -626,7 +600,7 @@ static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
if (cm_id_priv) {
if (cm_id_priv->id.remote_id == remote_id)
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
else
cm_id_priv = NULL;
}
@@ -883,7 +857,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
INIT_LIST_HEAD(&cm_id_priv->prim_list);
INIT_LIST_HEAD(&cm_id_priv->altr_list);
atomic_set(&cm_id_priv->work_count, -1);
- atomic_set(&cm_id_priv->refcount, 1);
+ refcount_set(&cm_id_priv->refcount, 1);
return &cm_id_priv->id;
error:
@@ -1230,7 +1204,7 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
spin_unlock_irqrestore(&cm.lock, flags);
return ERR_PTR(-EINVAL);
}
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
++cm_id_priv->listen_sharecount;
spin_unlock_irqrestore(&cm.lock, flags);
@@ -1525,14 +1499,6 @@ static int cm_issue_rej(struct cm_port *port,
return ret;
}
-static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
- __be32 local_qpn, __be32 remote_qpn)
-{
- return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
- ((local_ca_guid == remote_ca_guid) &&
- (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
-}
-
static bool cm_req_has_alt_path(struct cm_req_msg *req_msg)
{
return ((req_msg->alt_local_lid) ||
@@ -1895,8 +1861,8 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
NULL, 0);
goto out;
}
- atomic_inc(&listen_cm_id_priv->refcount);
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&listen_cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
cm_id_priv->id.state = IB_CM_REQ_RCVD;
atomic_inc(&cm_id_priv->work_count);
spin_unlock_irq(&cm.lock);
@@ -2052,7 +2018,7 @@ static int cm_req_handler(struct cm_work *work)
return 0;
rejected:
- atomic_dec(&cm_id_priv->refcount);
+ refcount_dec(&cm_id_priv->refcount);
cm_deref_id(listen_cm_id_priv);
free_timeinfo:
kfree(cm_id_priv->timewait_info);
@@ -2826,7 +2792,7 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
cm_local_id(timewait_info->work.local_id));
if (cm_id_priv) {
if (cm_id_priv->id.remote_id == remote_id)
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
else
cm_id_priv = NULL;
}
@@ -3434,7 +3400,7 @@ static int cm_timewait_handler(struct cm_work *work)
struct cm_id_private *cm_id_priv;
int ret;
- timewait_info = (struct cm_timewait_info *)work;
+ timewait_info = container_of(work, struct cm_timewait_info, work);
spin_lock_irq(&cm.lock);
list_del(&timewait_info->list);
spin_unlock_irq(&cm.lock);
@@ -3596,8 +3562,8 @@ static int cm_sidr_req_handler(struct cm_work *work)
cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
goto out; /* No match. */
}
- atomic_inc(&cur_cm_id_priv->refcount);
- atomic_inc(&cm_id_priv->refcount);
+ refcount_inc(&cur_cm_id_priv->refcount);
+ refcount_inc(&cm_id_priv->refcount);
spin_unlock_irq(&cm.lock);
cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
index 3d16d614aff6..92d7260ac913 100644
--- a/drivers/infiniband/core/cm_msgs.h
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -1,37 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
* Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
* Copyright (c) 2004 Topspin Corporation. All rights reserved.
* Copyright (c) 2004 Voltaire Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING the madirectory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use source and binary forms, with or
- * withmodification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retathe above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE
- * SOFTWARE.
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
*/
-#if !defined(CM_MSGS_H)
+#ifndef CM_MSGS_H
#define CM_MSGS_H
#include <rdma/ib_mad.h>
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index d78f67623f24..25f2b70fd8ef 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1,36 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
- * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
*/
#include <linux/completion.h>
@@ -2531,7 +2504,9 @@ EXPORT_SYMBOL(rdma_set_service_type);
* This function should be called before rdma_connect() on active side,
* and on passive side before rdma_accept(). It is applicable to primary
* path only. The timeout will affect the local side of the QP, it is not
- * negotiated with remote side and zero disables the timer.
+ * negotiated with remote side and zero disables the timer. In case it is
+ * set before rdma_resolve_route, the value will also be used to determine
+ * PacketLifeTime for RoCE.
*
* Return: 0 for success
*/
@@ -2828,22 +2803,65 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
return 0;
}
-static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio)
{
- int prio;
struct net_device *dev;
- prio = rt_tos2priority(tos);
- dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev;
+ dev = vlan_dev_real_dev(vlan_ndev);
if (dev->num_tc)
return netdev_get_prio_tc_map(dev, prio);
-#if IS_ENABLED(CONFIG_VLAN_8021Q)
+ return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+}
+
+struct iboe_prio_tc_map {
+ int input_prio;
+ int output_tc;
+ bool found;
+};
+
+static int get_lower_vlan_dev_tc(struct net_device *dev, void *data)
+{
+ struct iboe_prio_tc_map *map = data;
+
+ if (is_vlan_dev(dev))
+ map->output_tc = get_vlan_ndev_tc(dev, map->input_prio);
+ else if (dev->num_tc)
+ map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio);
+ else
+ map->output_tc = 0;
+ /* We are interested only in first level VLAN device, so always
+ * return 1 to stop iterating over next level devices.
+ */
+ map->found = true;
+ return 1;
+}
+
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+ struct iboe_prio_tc_map prio_tc_map = {};
+ int prio = rt_tos2priority(tos);
+
+ /* If VLAN device, get it directly from the VLAN netdev */
if (is_vlan_dev(ndev))
- return (vlan_dev_get_egress_qos_mask(ndev, prio) &
- VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
-#endif
- return 0;
+ return get_vlan_ndev_tc(ndev, prio);
+
+ prio_tc_map.input_prio = prio;
+ rcu_read_lock();
+ netdev_walk_all_lower_dev_rcu(ndev,
+ get_lower_vlan_dev_tc,
+ &prio_tc_map);
+ rcu_read_unlock();
+ /* If map is found from lower device, use it; Otherwise
+ * continue with the current netdevice to get priority to tc map.
+ */
+ if (prio_tc_map.found)
+ return prio_tc_map.output_tc;
+ else if (ndev->num_tc)
+ return netdev_get_prio_tc_map(ndev, prio);
+ else
+ return 0;
}
static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
@@ -2897,7 +2915,16 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
route->path_rec->rate = iboe_get_rate(ndev);
dev_put(ndev);
route->path_rec->packet_life_time_selector = IB_SA_EQ;
- route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+ /* In case ACK timeout is set, use this value to calculate
+ * PacketLifeTime. As per IBTA 12.7.34,
+ * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay).
+ * Assuming a negligible local ACK delay, we can use
+ * PacketLifeTime = local ACK timeout/2
+ * as a reasonable approximation for RoCE networks.
+ */
+ route->path_rec->packet_life_time = id_priv->timeout_set ?
+ id_priv->timeout - 1 : CMA_IBOE_PACKET_LIFETIME;
+
if (!route->path_rec->mtu) {
ret = -EINVAL;
goto err2;
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 9d07378b5b42..3645e092e1c7 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -388,4 +388,15 @@ int ib_device_set_netns_put(struct sk_buff *skb,
int rdma_nl_net_init(struct rdma_dev_net *rnet);
void rdma_nl_net_exit(struct rdma_dev_net *rnet);
+
+struct rdma_umap_priv {
+ struct vm_area_struct *vma;
+ struct list_head list;
+ struct rdma_user_mmap_entry *entry;
+};
+
+void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+ struct vm_area_struct *vma,
+ struct rdma_user_mmap_entry *entry);
+
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 680ad27f497d..8434ec082c3a 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -149,11 +149,18 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
struct auto_mode_param *param = &counter->mode.param;
bool match = true;
- if (!rdma_is_visible_in_pid_ns(&qp->res))
- return false;
-
- /* Ensure that counter belongs to the right PID */
- if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task))
+ /*
+ * Ensure that counter belongs to the right PID. This operation can
+ * race with user space which kills the process and leaves QP and
+ * counters orphans.
+ *
+ * It is not a big deal because exitted task will leave both QP and
+ * counter in the same bucket of zombie process. Just ensure that
+ * process is still alive before procedding.
+ *
+ */
+ if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task) ||
+ !task_pid_nr(qp->res.task))
return false;
if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
@@ -229,9 +236,6 @@ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
rt = &dev->res[RDMA_RESTRACK_COUNTER];
xa_lock(&rt->xa);
xa_for_each(&rt->xa, id, res) {
- if (!rdma_is_visible_in_pid_ns(res))
- continue;
-
counter = container_of(res, struct rdma_counter, res);
if ((counter->device != qp->device) || (counter->port != port))
goto next;
@@ -412,9 +416,6 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
if (IS_ERR(res))
return NULL;
- if (!rdma_is_visible_in_pid_ns(res))
- goto err;
-
qp = container_of(res, struct ib_qp, res);
if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
goto err;
@@ -445,11 +446,6 @@ static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
if (IS_ERR(res))
return NULL;
- if (!rdma_is_visible_in_pid_ns(res)) {
- rdma_restrack_put(res);
- return NULL;
- }
-
counter = container_of(res, struct rdma_counter, res);
kref_get(&counter->kref);
rdma_restrack_put(res);
@@ -463,10 +459,15 @@ static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
u32 qp_num, u32 counter_id)
{
+ struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
struct ib_qp *qp;
int ret;
+ port_counter = &dev->port_data[port].port_counter;
+ if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO)
+ return -EINVAL;
+
qp = rdma_counter_get_qp(dev, qp_num);
if (!qp)
return -ENOENT;
@@ -503,6 +504,7 @@ err:
int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
u32 qp_num, u32 *counter_id)
{
+ struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
struct ib_qp *qp;
int ret;
@@ -510,9 +512,13 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
if (!rdma_is_port_valid(dev, port))
return -EINVAL;
- if (!dev->port_data[port].port_counter.hstats)
+ port_counter = &dev->port_data[port].port_counter;
+ if (!port_counter->hstats)
return -EOPNOTSUPP;
+ if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO)
+ return -EINVAL;
+
qp = rdma_counter_get_qp(dev, qp_num);
if (!qp)
return -ENOENT;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 50a92442c4f7..9ebb09a75049 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -128,17 +128,14 @@ module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
"Share device among net namespaces; default=1 (shared)");
/**
- * rdma_dev_access_netns() - Return whether a rdma device can be accessed
+ * rdma_dev_access_netns() - Return whether an rdma device can be accessed
* from a specified net namespace or not.
- * @device: Pointer to rdma device which needs to be checked
+ * @dev: Pointer to rdma device which needs to be checked
* @net: Pointer to net namesapce for which access to be checked
*
- * rdma_dev_access_netns() - Return whether a rdma device can be accessed
- * from a specified net namespace or not. When
- * rdma device is in shared mode, it ignores the
- * net namespace. When rdma device is exclusive
- * to a net namespace, rdma device net namespace is
- * checked against the specified one.
+ * When the rdma device is in shared mode, it ignores the net namespace.
+ * When the rdma device is exclusive to a net namespace, rdma device net
+ * namespace is checked against the specified one.
*/
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
@@ -1199,9 +1196,21 @@ static void setup_dma_device(struct ib_device *device)
WARN_ON_ONCE(!parent);
device->dma_device = parent;
}
- /* Setup default max segment size for all IB devices */
- dma_set_max_seg_size(device->dma_device, SZ_2G);
+ if (!device->dev.dma_parms) {
+ if (parent) {
+ /*
+ * The caller did not provide DMA parameters, so
+ * 'parent' probably represents a PCI device. The PCI
+ * core sets the maximum segment size to 64
+ * KB. Increase this parameter to 2 GB.
+ */
+ device->dev.dma_parms = parent->dma_parms;
+ dma_set_max_seg_size(device->dma_device, SZ_2G);
+ } else {
+ WARN_ON_ONCE(true);
+ }
+ }
}
/*
@@ -1317,7 +1326,9 @@ out:
/**
* ib_register_device - Register an IB device with IB core
- * @device:Device to register
+ * @device: Device to register
+ * @name: unique string device name. This may include a '%' which will
+ * cause a unique index to be added to the passed device name.
*
* Low-level drivers use ib_register_device() to register their
* devices with the IB core. All registered clients will receive a
@@ -1444,7 +1455,7 @@ out:
/**
* ib_unregister_device - Unregister an IB device
- * @device: The device to unregister
+ * @ib_dev: The device to unregister
*
* Unregister an IB device. All clients will receive a remove callback.
*
@@ -1466,7 +1477,7 @@ EXPORT_SYMBOL(ib_unregister_device);
/**
* ib_unregister_device_and_put - Unregister a device while holding a 'get'
- * device: The device to unregister
+ * @ib_dev: The device to unregister
*
* This is the same as ib_unregister_device(), except it includes an internal
* ib_device_put() that should match a 'get' obtained by the caller.
@@ -1536,7 +1547,7 @@ static void ib_unregister_work(struct work_struct *work)
/**
* ib_unregister_device_queued - Unregister a device using a work queue
- * device: The device to unregister
+ * @ib_dev: The device to unregister
*
* This schedules an asynchronous unregistration using a WQ for the device. A
* driver should use this to avoid holding locks while doing unregistration,
@@ -2366,7 +2377,7 @@ int ib_modify_device(struct ib_device *device,
struct ib_device_modify *device_modify)
{
if (!device->ops.modify_device)
- return -ENOSYS;
+ return -EOPNOTSUPP;
return device->ops.modify_device(device, device_modify_mask,
device_modify);
@@ -2397,8 +2408,12 @@ int ib_modify_port(struct ib_device *device,
rc = device->ops.modify_port(device, port_num,
port_modify_mask,
port_modify);
+ else if (rdma_protocol_roce(device, port_num) &&
+ ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
+ (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
+ rc = 0;
else
- rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
+ rc = -EOPNOTSUPP;
return rc;
}
EXPORT_SYMBOL(ib_modify_port);
@@ -2607,6 +2622,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, drain_sq);
SET_DEVICE_OP(dev_ops, enable_driver);
SET_DEVICE_OP(dev_ops, fill_res_entry);
+ SET_DEVICE_OP(dev_ops, fill_stat_entry);
SET_DEVICE_OP(dev_ops, get_dev_fw_str);
SET_DEVICE_OP(dev_ops, get_dma_mr);
SET_DEVICE_OP(dev_ops, get_hw_stats);
@@ -2615,6 +2631,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_port_immutable);
SET_DEVICE_OP(dev_ops, get_vector_affinity);
SET_DEVICE_OP(dev_ops, get_vf_config);
+ SET_DEVICE_OP(dev_ops, get_vf_guid);
SET_DEVICE_OP(dev_ops, get_vf_stats);
SET_DEVICE_OP(dev_ops, init_port);
SET_DEVICE_OP(dev_ops, invalidate_range);
@@ -2630,6 +2647,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
SET_DEVICE_OP(dev_ops, map_phys_fmr);
SET_DEVICE_OP(dev_ops, mmap);
+ SET_DEVICE_OP(dev_ops, mmap_free);
SET_DEVICE_OP(dev_ops, modify_ah);
SET_DEVICE_OP(dev_ops, modify_cq);
SET_DEVICE_OP(dev_ops, modify_device);
diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c
new file mode 100644
index 000000000000..f509c478b469
--- /dev/null
+++ b/drivers/infiniband/core/ib_core_uverbs.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2019 Marvell. All rights reserved.
+ */
+#include <linux/xarray.h>
+#include "uverbs.h"
+#include "core_priv.h"
+
+/**
+ * rdma_umap_priv_init() - Initialize the private data of a vma
+ *
+ * @priv: The already allocated private data
+ * @vma: The vm area struct that needs private data
+ * @entry: entry into the mmap_xa that needs to be linked with
+ * this vma
+ *
+ * Each time we map IO memory into user space this keeps track of the
+ * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space
+ * to point to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ *
+ */
+void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+ struct vm_area_struct *vma,
+ struct rdma_user_mmap_entry *entry)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+ priv->vma = vma;
+ if (entry) {
+ kref_get(&entry->ref);
+ priv->entry = entry;
+ }
+ vma->vm_private_data = priv;
+ /* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */
+
+ mutex_lock(&ufile->umap_lock);
+ list_add(&priv->list, &ufile->umaps);
+ mutex_unlock(&ufile->umap_lock);
+}
+EXPORT_SYMBOL(rdma_umap_priv_init);
+
+/**
+ * rdma_user_mmap_io() - Map IO memory into a process
+ *
+ * @ucontext: associated user context
+ * @vma: the vma related to the current mmap call
+ * @pfn: pfn to map
+ * @size: size to map
+ * @prot: pgprot to use in remap call
+ * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
+ * if mmap_entry is not used by the driver
+ *
+ * This is to be called by drivers as part of their mmap() functions if they
+ * wish to send something like PCI-E BAR memory to userspace.
+ *
+ * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on
+ * success.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long size, pgprot_t prot,
+ struct rdma_user_mmap_entry *entry)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ struct rdma_umap_priv *priv;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ if (vma->vm_end - vma->vm_start != size)
+ return -EINVAL;
+
+ /* Driver is using this wrong, must be called by ib_uverbs_mmap */
+ if (WARN_ON(!vma->vm_file ||
+ vma->vm_file->private_data != ufile))
+ return -EINVAL;
+ lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ vma->vm_page_prot = prot;
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+ kfree(priv);
+ return -EAGAIN;
+ }
+
+ rdma_umap_priv_init(priv, vma, entry);
+ return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/**
+ * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @pgoff: The mmap offset >> PAGE_SHIFT
+ *
+ * This function is called when a user tries to mmap with an offset (returned
+ * by rdma_user_mmap_get_offset()) it initially received from the driver. The
+ * rdma_user_mmap_entry was created by the function
+ * rdma_user_mmap_entry_insert(). This function increases the refcnt of the
+ * entry so that it won't be deleted from the xarray in the meantime.
+ *
+ * Return an reference to an entry if exists or NULL if there is no
+ * match. rdma_user_mmap_entry_put() must be called to put the reference.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+ unsigned long pgoff)
+{
+ struct rdma_user_mmap_entry *entry;
+
+ if (pgoff > U32_MAX)
+ return NULL;
+
+ xa_lock(&ucontext->mmap_xa);
+
+ entry = xa_load(&ucontext->mmap_xa, pgoff);
+
+ /*
+ * If refcount is zero, entry is already being deleted, driver_removed
+ * indicates that the no further mmaps are possible and we waiting for
+ * the active VMAs to be closed.
+ */
+ if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
+ !kref_get_unless_zero(&entry->ref))
+ goto err;
+
+ xa_unlock(&ucontext->mmap_xa);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
+ pgoff, entry->npages);
+
+ return entry;
+
+err:
+ xa_unlock(&ucontext->mmap_xa);
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
+
+/**
+ * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @vma: the vma being mmap'd into
+ *
+ * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
+ * checks that the VMA is correct.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma)
+{
+ struct rdma_user_mmap_entry *entry;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return NULL;
+ entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
+ if (!entry)
+ return NULL;
+ if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
+ rdma_user_mmap_entry_put(entry);
+ return NULL;
+ }
+ return entry;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+static void rdma_user_mmap_entry_free(struct kref *kref)
+{
+ struct rdma_user_mmap_entry *entry =
+ container_of(kref, struct rdma_user_mmap_entry, ref);
+ struct ib_ucontext *ucontext = entry->ucontext;
+ unsigned long i;
+
+ /*
+ * Erase all entries occupied by this single entry, this is deferred
+ * until all VMA are closed so that the mmap offsets remain unique.
+ */
+ xa_lock(&ucontext->mmap_xa);
+ for (i = 0; i < entry->npages; i++)
+ __xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
+ xa_unlock(&ucontext->mmap_xa);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
+ entry->start_pgoff, entry->npages);
+
+ if (ucontext->device->ops.mmap_free)
+ ucontext->device->ops.mmap_free(entry);
+}
+
+/**
+ * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
+ *
+ * @entry: an entry in the mmap_xa
+ *
+ * This function is called when the mapping is closed if it was
+ * an io mapping or when the driver is done with the entry for
+ * some other reason.
+ * Should be called after rdma_user_mmap_entry_get was called
+ * and entry is no longer needed. This function will erase the
+ * entry and free it if its refcnt reaches zero.
+ */
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+ kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_put);
+
+/**
+ * rdma_user_mmap_entry_remove() - Drop reference to entry and
+ * mark it as unmmapable
+ *
+ * @entry: the entry to insert into the mmap_xa
+ *
+ * Drivers can call this to prevent userspace from creating more mappings for
+ * entry, however existing mmaps continue to exist and ops->mmap_free() will
+ * not be called until all user mmaps are destroyed.
+ */
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+ if (!entry)
+ return;
+
+ entry->driver_removed = true;
+ kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
+
+/**
+ * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length of the address that will be mmapped
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for implementing their mmap syscall A database of mmap offsets is
+ * handled in the core and helper functions are provided to insert entries
+ * into the database and extract entries when the user calls mmap with the
+ * given offset. The function allocates a unique page offset that should be
+ * provided to user, the user will use the offset to retrieve information such
+ * as address to be mapped and how.
+ *
+ * Return: 0 on success and -ENOMEM on failure
+ */
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ XA_STATE(xas, &ucontext->mmap_xa, 0);
+ u32 xa_first, xa_last, npages;
+ int err;
+ u32 i;
+
+ if (!entry)
+ return -EINVAL;
+
+ kref_init(&entry->ref);
+ entry->ucontext = ucontext;
+
+ /*
+ * We want the whole allocation to be done without interruption from a
+ * different thread. The allocation requires finding a free range and
+ * storing. During the xa_insert the lock could be released, possibly
+ * allowing another thread to choose the same range.
+ */
+ mutex_lock(&ufile->umap_lock);
+
+ xa_lock(&ucontext->mmap_xa);
+
+ /* We want to find an empty range */
+ npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
+ entry->npages = npages;
+ while (true) {
+ /* First find an empty index */
+ xas_find_marked(&xas, U32_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ goto err_unlock;
+
+ xa_first = xas.xa_index;
+
+ /* Is there enough room to have the range? */
+ if (check_add_overflow(xa_first, npages, &xa_last))
+ goto err_unlock;
+
+ /*
+ * Now look for the next present entry. If an entry doesn't
+ * exist, we found an empty range and can proceed.
+ */
+ xas_next_entry(&xas, xa_last - 1);
+ if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
+ break;
+ }
+
+ for (i = xa_first; i < xa_last; i++) {
+ err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
+ if (err)
+ goto err_undo;
+ }
+
+ /*
+ * Internally the kernel uses a page offset, in libc this is a byte
+ * offset. Drivers should not return pgoff to userspace.
+ */
+ entry->start_pgoff = xa_first;
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
+ entry->start_pgoff, npages);
+
+ return 0;
+
+err_undo:
+ for (; i > xa_first; i--)
+ __xa_erase(&ucontext->mmap_xa, i - 1);
+
+err_unlock:
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index 7e2bcc72f66c..1bf87d9fd0bd 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -210,8 +210,10 @@ int iwpm_mapinfo_available(void);
/**
* iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ * @a_sockaddr: first sockaddr to compare
+ * @b_sockaddr: second sockaddr to compare
*
- * Returns 0 if they are holding the same ip/tcp address info,
+ * Return: 0 if they are holding the same ip/tcp address info,
* otherwise returns 1
*/
int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
@@ -272,6 +274,7 @@ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
* iwpm_send_hello - Send hello response to iwpmd
*
* @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
* @abi_version: The kernel's abi_version
*
* Returns 0 on success or a negative error code
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 9947d16edef2..c54db13fa9b0 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -913,9 +913,9 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
/* No GRH for DR SMP */
ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL,
- (const struct ib_mad_hdr *)smp, mad_size,
- (struct ib_mad_hdr *)mad_priv->mad,
- &mad_size, &out_mad_pkey_index);
+ (const struct ib_mad *)smp,
+ (struct ib_mad *)mad_priv->mad, &mad_size,
+ &out_mad_pkey_index);
switch (ret)
{
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
@@ -1397,25 +1397,6 @@ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
}
EXPORT_SYMBOL(ib_free_recv_mad);
-struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
- u8 rmpp_version,
- ib_mad_send_handler send_handler,
- ib_mad_recv_handler recv_handler,
- void *context)
-{
- return ERR_PTR(-EINVAL); /* XXX: for now */
-}
-EXPORT_SYMBOL(ib_redirect_mad_qp);
-
-int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
- struct ib_wc *wc)
-{
- dev_err(&mad_agent->device->dev,
- "ib_process_mad_wc() not implemented yet\n");
- return 0;
-}
-EXPORT_SYMBOL(ib_process_mad_wc);
-
static int method_in_use(struct ib_mad_mgmt_method_table **method,
struct ib_mad_reg_req *mad_reg_req)
{
@@ -2340,9 +2321,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (port_priv->device->ops.process_mad) {
ret = port_priv->device->ops.process_mad(
port_priv->device, 0, port_priv->port_num, wc,
- &recv->grh, (const struct ib_mad_hdr *)recv->mad,
- recv->mad_size, (struct ib_mad_hdr *)response->mad,
- &mad_size, &resp_mad_pkey_index);
+ &recv->grh, (const struct ib_mad *)recv->mad,
+ (struct ib_mad *)response->mad, &mad_size,
+ &resp_mad_pkey_index);
if (opa)
wc->pkey_index = resp_mad_pkey_index;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index c03af08b80e7..cbf6041a5d4a 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -42,6 +42,9 @@
#include "cma_priv.h"
#include "restrack.h"
+typedef int (*res_fill_func_t)(struct sk_buff*, bool,
+ struct rdma_restrack_entry*, uint32_t);
+
/*
* Sort array elements by the netlink attribute name
*/
@@ -180,6 +183,19 @@ static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name,
return 0;
}
+int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name,
+ const char *str)
+{
+ if (put_driver_name_print_type(msg, name,
+ RDMA_NLDEV_PRINT_TYPE_UNSPEC))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str))
+ return -EMSGSIZE;
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_string);
+
int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value)
{
return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC,
@@ -399,20 +415,34 @@ err:
static int fill_res_name_pid(struct sk_buff *msg,
struct rdma_restrack_entry *res)
{
+ int err = 0;
+
/*
* For user resources, user is should read /proc/PID/comm to get the
* name of the task file.
*/
if (rdma_is_kernel_res(res)) {
- if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME,
- res->kern_name))
- return -EMSGSIZE;
+ err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME,
+ res->kern_name);
} else {
- if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID,
- task_pid_vnr(res->task)))
- return -EMSGSIZE;
+ pid_t pid;
+
+ pid = task_pid_vnr(res->task);
+ /*
+ * Task is dead and in zombie state.
+ * There is no need to print PID anymore.
+ */
+ if (pid)
+ /*
+ * This part is racy, task can be killed and PID will
+ * be zero right here but it is ok, next query won't
+ * return PID. We don't promise real-time reflection
+ * of SW objects.
+ */
+ err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid);
}
- return 0;
+
+ return err ? -EMSGSIZE : 0;
}
static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
@@ -423,6 +453,14 @@ static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
return dev->ops.fill_res_entry(msg, res);
}
+static bool fill_stat_entry(struct ib_device *dev, struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ if (!dev->ops.fill_stat_entry)
+ return false;
+ return dev->ops.fill_stat_entry(msg, res);
+}
+
static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
@@ -698,9 +736,6 @@ static int fill_stat_counter_qps(struct sk_buff *msg,
rt = &counter->device->res[RDMA_RESTRACK_QP];
xa_lock(&rt->xa);
xa_for_each(&rt->xa, id, res) {
- if (!rdma_is_visible_in_pid_ns(res))
- continue;
-
qp = container_of(res, struct ib_qp, res);
if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
continue;
@@ -723,8 +758,8 @@ err:
return ret;
}
-static int fill_stat_hwcounter_entry(struct sk_buff *msg,
- const char *name, u64 value)
+int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name,
+ u64 value)
{
struct nlattr *entry_attr;
@@ -746,6 +781,25 @@ err:
nla_nest_cancel(msg, entry_attr);
return -EMSGSIZE;
}
+EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry);
+
+static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_mr *mr = container_of(res, struct ib_mr, res);
+ struct ib_device *dev = mr->pd->device;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+ goto err;
+
+ if (fill_stat_entry(dev, msg, res))
+ goto err;
+
+ return 0;
+
+err:
+ return -EMSGSIZE;
+}
static int fill_stat_counter_hwcounters(struct sk_buff *msg,
struct rdma_counter *counter)
@@ -759,7 +813,7 @@ static int fill_stat_counter_hwcounters(struct sk_buff *msg,
return -EMSGSIZE;
for (i = 0; i < st->num_counters; i++)
- if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+ if (rdma_nl_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
goto err;
nla_nest_end(msg, table_attr);
@@ -1117,8 +1171,6 @@ static int nldev_res_get_dumpit(struct sk_buff *skb,
}
struct nldev_fill_res_entry {
- int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin,
- struct rdma_restrack_entry *res, u32 port);
enum rdma_nldev_attr nldev_attr;
enum rdma_nldev_command nldev_cmd;
u8 flags;
@@ -1132,21 +1184,18 @@ enum nldev_res_flags {
static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
[RDMA_RESTRACK_QP] = {
- .fill_res_func = fill_res_qp_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
.entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
.id = RDMA_NLDEV_ATTR_RES_LQPN,
},
[RDMA_RESTRACK_CM_ID] = {
- .fill_res_func = fill_res_cm_id_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
.entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
.id = RDMA_NLDEV_ATTR_RES_CM_IDN,
},
[RDMA_RESTRACK_CQ] = {
- .fill_res_func = fill_res_cq_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
.flags = NLDEV_PER_DEV,
@@ -1154,7 +1203,6 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.id = RDMA_NLDEV_ATTR_RES_CQN,
},
[RDMA_RESTRACK_MR] = {
- .fill_res_func = fill_res_mr_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
.flags = NLDEV_PER_DEV,
@@ -1162,7 +1210,6 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.id = RDMA_NLDEV_ATTR_RES_MRN,
},
[RDMA_RESTRACK_PD] = {
- .fill_res_func = fill_res_pd_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
.flags = NLDEV_PER_DEV,
@@ -1170,7 +1217,6 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.id = RDMA_NLDEV_ATTR_RES_PDN,
},
[RDMA_RESTRACK_COUNTER] = {
- .fill_res_func = fill_res_counter_entry,
.nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
.nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
.entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
@@ -1180,7 +1226,8 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
- enum rdma_restrack_type res_type)
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
{
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
@@ -1222,11 +1269,6 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err;
}
- if (!rdma_is_visible_in_pid_ns(res)) {
- ret = -ENOENT;
- goto err_get;
- }
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
@@ -1243,7 +1285,9 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
}
has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN);
- ret = fe->fill_res_func(msg, has_cap_net_admin, res, port);
+
+ ret = fill_func(msg, has_cap_net_admin, res, port);
+
rdma_restrack_put(res);
if (ret)
goto err_free;
@@ -1263,7 +1307,8 @@ err:
static int res_get_common_dumpit(struct sk_buff *skb,
struct netlink_callback *cb,
- enum rdma_restrack_type res_type)
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
{
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
@@ -1334,9 +1379,6 @@ static int res_get_common_dumpit(struct sk_buff *skb,
* objects.
*/
xa_for_each(&rt->xa, id, res) {
- if (!rdma_is_visible_in_pid_ns(res))
- continue;
-
if (idx < start || !rdma_restrack_get(res))
goto next;
@@ -1351,7 +1393,8 @@ static int res_get_common_dumpit(struct sk_buff *skb,
goto msg_full;
}
- ret = fe->fill_res_func(skb, has_cap_net_admin, res, port);
+ ret = fill_func(skb, has_cap_net_admin, res, port);
+
rdma_restrack_put(res);
if (ret) {
@@ -1394,17 +1437,19 @@ err_index:
return ret;
}
-#define RES_GET_FUNCS(name, type) \
- static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \
+#define RES_GET_FUNCS(name, type) \
+ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \
struct netlink_callback *cb) \
- { \
- return res_get_common_dumpit(skb, cb, type); \
- } \
- static int nldev_res_get_##name##_doit(struct sk_buff *skb, \
- struct nlmsghdr *nlh, \
+ { \
+ return res_get_common_dumpit(skb, cb, type, \
+ fill_res_##name##_entry); \
+ } \
+ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \
+ struct nlmsghdr *nlh, \
struct netlink_ext_ack *extack) \
- { \
- return res_get_common_doit(skb, nlh, extack, type); \
+ { \
+ return res_get_common_doit(skb, nlh, extack, type, \
+ fill_res_##name##_entry); \
}
RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
@@ -1880,7 +1925,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb,
for (i = 0; i < num_cnts; i++) {
v = stats->value[i] +
rdma_counter_get_hwstat_value(device, port, i);
- if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+ if (rdma_nl_stat_hwcounter_entry(msg, stats->names[i], v)) {
ret = -EMSGSIZE;
goto err_table;
}
@@ -1989,7 +2034,10 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
case RDMA_NLDEV_ATTR_RES_QP:
ret = stat_get_doit_qp(skb, nlh, extack, tb);
break;
-
+ case RDMA_NLDEV_ATTR_RES_MR:
+ ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR,
+ fill_stat_mr_entry);
+ break;
default:
ret = -EINVAL;
break;
@@ -2013,7 +2061,10 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb,
case RDMA_NLDEV_ATTR_RES_QP:
ret = nldev_res_get_counter_dumpit(skb, cb);
break;
-
+ case RDMA_NLDEV_ATTR_RES_MR:
+ ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR,
+ fill_stat_mr_entry);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index ccf4d069c25c..6c72773faf29 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
rdma_restrack_del(&ucontext->res);
ib_dev->ops.dealloc_ucontext(ucontext);
+ WARN_ON(!xa_empty(&ucontext->mmap_xa));
kfree(ucontext);
ufile->ucontext = NULL;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index a07665f7ef8c..62fbb0ae9cb4 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -116,11 +116,8 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
u32 cnt = 0;
xa_lock(&rt->xa);
- xas_for_each(&xas, e, U32_MAX) {
- if (!rdma_is_visible_in_pid_ns(e))
- continue;
+ xas_for_each(&xas, e, U32_MAX)
cnt++;
- }
xa_unlock(&rt->xa);
return cnt;
}
@@ -346,18 +343,3 @@ out:
}
}
EXPORT_SYMBOL(rdma_restrack_del);
-
-bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res)
-{
- /*
- * 1. Kern resources should be visible in init
- * namespace only
- * 2. Present only resources visible in the current
- * namespace
- */
- if (rdma_is_kernel_res(res))
- return task_active_pid_ns(current) == &init_pid_ns;
-
- /* PID 0 means that resource is not found in current namespace */
- return task_pid_vnr(res->task);
-}
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
index 7bd177cc0a61..d084e5f89849 100644
--- a/drivers/infiniband/core/restrack.h
+++ b/drivers/infiniband/core/restrack.h
@@ -27,5 +27,4 @@ int rdma_restrack_init(struct ib_device *dev);
void rdma_restrack_clean(struct ib_device *dev);
void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
struct task_struct *task);
-bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res);
#endif /* _RDMA_CORE_RESTRACK_H_ */
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 5337393d4dfe..4fad732f9b3c 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -20,14 +20,17 @@ module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
/*
- * Check if the device might use memory registration. This is currently only
- * true for iWarp devices. In the future we can hopefully fine tune this based
- * on HCA driver input.
+ * Report whether memory registration should be used. Memory registration must
+ * be used for iWarp devices because of iWARP-specific limitations. Memory
+ * registration is also enabled if registering memory might yield better
+ * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
*/
static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
{
if (rdma_protocol_iwarp(dev, port_num))
return true;
+ if (dev->attrs.max_sgl_rd)
+ return true;
if (unlikely(rdma_rw_force_mr))
return true;
return false;
@@ -35,17 +38,19 @@ static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
/*
* Check if the device will use memory registration for this RW operation.
- * We currently always use memory registrations for iWarp RDMA READs, and
- * have a debug option to force usage of MRs.
- *
- * XXX: In the future we can hopefully fine tune this based on HCA driver
- * input.
+ * For RDMA READs we must use MRs on iWarp and can optionally use them as an
+ * optimization otherwise. Additionally we have a debug option to force usage
+ * of MRs to help testing this code path.
*/
static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
enum dma_data_direction dir, int dma_nents)
{
- if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
- return true;
+ if (dir == DMA_FROM_DEVICE) {
+ if (rdma_protocol_iwarp(dev, port_num))
+ return true;
+ if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd)
+ return true;
+ }
if (unlikely(rdma_rw_force_mr))
return true;
return false;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 17fc2936c077..8917125ea16d 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -1246,7 +1246,7 @@ static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
* @port_num: Port on the specified device.
* @rec: path record entry to use for ah attributes initialization.
* @ah_attr: address handle attributes to initialization from path record.
- * @sgid_attr: SGID attribute to consider during initialization.
+ * @gid_attr: SGID attribute to consider during initialization.
*
* When ib_init_ah_attr_from_path() returns success,
* (a) for IB link layer it optionally contains a reference to SGID attribute
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 7a50cedcef1f..087682e6969e 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -481,8 +481,8 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
if (!dev->ops.process_mad)
return -ENOSYS;
- in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
- out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kzalloc(sizeof(*out_mad), GFP_KERNEL);
if (!in_mad || !out_mad) {
ret = -ENOMEM;
goto out;
@@ -497,10 +497,8 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
if (attr != IB_PMA_CLASS_PORT_INFO)
in_mad->data[41] = port_num; /* PortSelect field */
- if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY,
- port_num, NULL, NULL,
- (const struct ib_mad_hdr *)in_mad, mad_size,
- (struct ib_mad_hdr *)out_mad, &mad_size,
+ if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL,
+ in_mad, out_mad, &mad_size,
&out_mad_pkey_index) &
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
@@ -1268,7 +1266,7 @@ static ssize_t node_desc_store(struct device *device,
int ret;
if (!dev->ops.modify_device)
- return -EIO;
+ return -EOPNOTSUPP;
memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 24244a2f68cc..7a3b99597ead 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -185,10 +185,9 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
- * @dmasync: flush in-flight DMA when the memory region is written
*/
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
- size_t size, int access, int dmasync)
+ size_t size, int access)
{
struct ib_ucontext *context;
struct ib_umem *umem;
@@ -199,7 +198,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct mm_struct *mm;
unsigned long npages;
int ret;
- unsigned long dma_attrs = 0;
struct scatterlist *sg;
unsigned int gup_flags = FOLL_WRITE;
@@ -211,9 +209,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
if (!context)
return ERR_PTR(-EIO);
- if (dmasync)
- dma_attrs |= DMA_ATTR_WRITE_BARRIER;
-
/*
* If the combination of the addr and size requested for this memory
* region causes an integer overflow, return error.
@@ -294,11 +289,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
sg_mark_end(sg);
- umem->nmap = ib_dma_map_sg_attrs(context->device,
+ umem->nmap = ib_dma_map_sg(context->device,
umem->sg_head.sgl,
umem->sg_nents,
- DMA_BIDIRECTIONAL,
- dma_attrs);
+ DMA_BIDIRECTIONAL);
if (!umem->nmap) {
ret = -ENOMEM;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 163ff7ba92b7..d7d5fadf0899 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -508,7 +508,6 @@ static int ib_umem_odp_map_dma_single_page(
{
struct ib_device *dev = umem_odp->umem.ibdev;
dma_addr_t dma_addr;
- int remove_existing_mapping = 0;
int ret = 0;
/*
@@ -534,28 +533,29 @@ static int ib_umem_odp_map_dma_single_page(
} else if (umem_odp->page_list[page_index] == page) {
umem_odp->dma_list[page_index] |= access_mask;
} else {
- pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
- umem_odp->page_list[page_index], page);
- /* Better remove the mapping now, to prevent any further
- * damage. */
- remove_existing_mapping = 1;
+ /*
+ * This is a race here where we could have done:
+ *
+ * CPU0 CPU1
+ * get_user_pages()
+ * invalidate()
+ * page_fault()
+ * mutex_lock(umem_mutex)
+ * page from GUP != page in ODP
+ *
+ * It should be prevented by the retry test above as reading
+ * the seq number should be reliable under the
+ * umem_mutex. Thus something is really not working right if
+ * things get here.
+ */
+ WARN(true,
+ "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+ umem_odp->page_list[page_index], page);
+ ret = -EAGAIN;
}
out:
put_user_page(page);
-
- if (remove_existing_mapping) {
- ib_umem_notifier_start_account(umem_odp);
- dev->ops.invalidate_range(
- umem_odp,
- ib_umem_start(umem_odp) +
- (page_index << umem_odp->page_shift),
- ib_umem_start(umem_odp) +
- ((page_index + 1) << umem_odp->page_shift));
- ib_umem_notifier_end_account(umem_odp);
- ret = -EAGAIN;
- }
-
return ret;
}
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 14a80fd9f464..06ed32c8662f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -252,6 +252,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
ucontext->closing = false;
ucontext->cleanup_retryable = false;
+ xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
+
ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
goto err_free;
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 61758201d9b2..269938f59d3f 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -795,6 +795,9 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
{
const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
if (size < attr->ptr_attr.len) {
if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size,
attr->ptr_attr.len - size))
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index db98111b47f4..9aa7ffc1d12a 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -772,6 +772,8 @@ out_unlock:
return (ret) ? : count;
}
+static const struct vm_operations_struct rdma_umap_ops;
+
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ib_uverbs_file *file = filp->private_data;
@@ -785,7 +787,7 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
ret = PTR_ERR(ucontext);
goto out;
}
-
+ vma->vm_ops = &rdma_umap_ops;
ret = ucontext->device->ops.mmap(ucontext, vma);
out:
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
@@ -793,38 +795,6 @@ out:
}
/*
- * Each time we map IO memory into user space this keeps track of the mapping.
- * When the device is hot-unplugged we 'zap' the mmaps in user space to point
- * to the zero page and allow the hot unplug to proceed.
- *
- * This is necessary for cases like PCI physical hot unplug as the actual BAR
- * memory may vanish after this and access to it from userspace could MCE.
- *
- * RDMA drivers supporting disassociation must have their user space designed
- * to cope in some way with their IO pages going to the zero page.
- */
-struct rdma_umap_priv {
- struct vm_area_struct *vma;
- struct list_head list;
-};
-
-static const struct vm_operations_struct rdma_umap_ops;
-
-static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
- struct vm_area_struct *vma)
-{
- struct ib_uverbs_file *ufile = vma->vm_file->private_data;
-
- priv->vma = vma;
- vma->vm_private_data = priv;
- vma->vm_ops = &rdma_umap_ops;
-
- mutex_lock(&ufile->umap_lock);
- list_add(&priv->list, &ufile->umaps);
- mutex_unlock(&ufile->umap_lock);
-}
-
-/*
* The VMA has been dup'd, initialize the vm_private_data with a new tracking
* struct
*/
@@ -849,7 +819,7 @@ static void rdma_umap_open(struct vm_area_struct *vma)
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
goto out_unlock;
- rdma_umap_priv_init(priv, vma);
+ rdma_umap_priv_init(priv, vma, opriv->entry);
up_read(&ufile->hw_destroy_rwsem);
return;
@@ -880,6 +850,9 @@ static void rdma_umap_close(struct vm_area_struct *vma)
* this point.
*/
mutex_lock(&ufile->umap_lock);
+ if (priv->entry)
+ rdma_user_mmap_entry_put(priv->entry);
+
list_del(&priv->list);
mutex_unlock(&ufile->umap_lock);
kfree(priv);
@@ -931,44 +904,6 @@ static const struct vm_operations_struct rdma_umap_ops = {
.fault = rdma_umap_fault,
};
-/*
- * Map IO memory into a process. This is to be called by drivers as part of
- * their mmap() functions if they wish to send something like PCI-E BAR memory
- * to userspace.
- */
-int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
- unsigned long pfn, unsigned long size, pgprot_t prot)
-{
- struct ib_uverbs_file *ufile = ucontext->ufile;
- struct rdma_umap_priv *priv;
-
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
-
- if (vma->vm_end - vma->vm_start != size)
- return -EINVAL;
-
- /* Driver is using this wrong, must be called by ib_uverbs_mmap */
- if (WARN_ON(!vma->vm_file ||
- vma->vm_file->private_data != ufile))
- return -EINVAL;
- lockdep_assert_held(&ufile->device->disassociate_srcu);
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (!priv)
- return -ENOMEM;
-
- vma->vm_page_prot = prot;
- if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
- kfree(priv);
- return -EAGAIN;
- }
-
- rdma_umap_priv_init(priv, vma);
- return 0;
-}
-EXPORT_SYMBOL(rdma_user_mmap_io);
-
void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
{
struct rdma_umap_priv *priv, *next_priv;
@@ -1018,6 +953,11 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
zap_vma_ptes(vma, vma->vm_start,
vma->vm_end - vma->vm_start);
+
+ if (priv->entry) {
+ rdma_user_mmap_entry_put(priv->entry);
+ priv->entry = NULL;
+ }
}
mutex_unlock(&ufile->umap_lock);
skip_mm:
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 35c2841a569e..dd765e176cdd 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -244,6 +244,8 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
/**
* ib_alloc_pd - Allocates an unused protection domain.
* @device: The device on which to allocate the protection domain.
+ * @flags: protection domain flags
+ * @caller: caller's build-time module name
*
* A protection domain object provides an association between QPs, shared
* receive queues, address handles, memory regions, and memory windows.
@@ -2459,6 +2461,16 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
}
EXPORT_SYMBOL(ib_set_vf_guid);
+int ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid)
+{
+ if (!device->ops.get_vf_guid)
+ return -EOPNOTSUPP;
+
+ return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid);
+}
+EXPORT_SYMBOL(ib_get_vf_guid);
/**
* ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
* information) and set an appropriate memory region for registration.