From 954a8e3aea87e896e320cf648c1a5bbe47de443e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 30 Aug 2018 08:35:19 +0300
Subject: RDMA/cma: Protect cma dev list with lock

When AF_IB addresses are used during rdma_resolve_addr() a lock is not
held. A cma device can get removed while list traversal is in progress
which may lead to crash. ie

        CPU0                                     CPU1
        ====                                     ====
rdma_resolve_addr()
 cma_resolve_ib_dev()
  list_for_each()                         cma_remove_one()
    cur_dev->device                        mutex_lock(&lock)
                                            list_del();
                                           mutex_unlock(&lock);
                                           cma_process_remove();


Therefore, hold a lock while traversing the list which avoids such
situation.

Cc: <stable@vger.kernel.org> # 3.10
Fixes: f17df3b0dede ("RDMA/cma: Add support for AF_IB to rdma_resolve_addr()")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index f72677291b69..a36c94930c31 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -724,6 +724,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 	dgid = (union ib_gid *) &addr->sib_addr;
 	pkey = ntohs(addr->sib_pkey);
 
+	mutex_lock(&lock);
 	list_for_each_entry(cur_dev, &dev_list, list) {
 		for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
 			if (!rdma_cap_af_ib(cur_dev->device, p))
@@ -750,18 +751,19 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 					cma_dev = cur_dev;
 					sgid = gid;
 					id_priv->id.port_num = p;
+					goto found;
 				}
 			}
 		}
 	}
-
-	if (!cma_dev)
-		return -ENODEV;
+	mutex_unlock(&lock);
+	return -ENODEV;
 
 found:
 	cma_attach_to_dev(id_priv, cma_dev);
-	addr = (struct sockaddr_ib *) cma_src_addr(id_priv);
-	memcpy(&addr->sib_addr, &sgid, sizeof sgid);
+	mutex_unlock(&lock);
+	addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
+	memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
 	cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8f28b178f71cc56eccf2a6e2c0ace17c82f900d7 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 3 Sep 2018 09:11:14 +0300
Subject: RDMA/mlx4: Ensure that maximal send/receive SGE less than supported
 by HW

In calculating the global maximum number of the Scatter/Gather elements
supported, the following four maximum parameters must be taken into
consideration: max_sg_rq, max_sg_sq, max_desc_sz_rq and max_desc_sz_sq.

However instead of bringing this complexity to query_device, which still
won't be sufficient anyway (the calculations are dependent on QP type),
the safer approach will be to restore old code, which will give us 32
SGEs.

Fixes: 33023fb85a42 ("IB/core: add max_send_sge and max_recv_sge attributes")
Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx4/main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index ca0f1ee26091..0bbeaaae47e0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -517,9 +517,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->page_size_cap	   = dev->dev->caps.page_size_cap;
 	props->max_qp		   = dev->dev->quotas.qp;
 	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
-	props->max_send_sge	   = dev->dev->caps.max_sq_sg;
-	props->max_recv_sge	   = dev->dev->caps.max_rq_sg;
-	props->max_sge_rd	   = MLX4_MAX_SGE_RD;
+	props->max_send_sge =
+		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg);
+	props->max_recv_sge =
+		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg);
+	props->max_sge_rd = MLX4_MAX_SGE_RD;
 	props->max_cq		   = dev->dev->quotas.cq;
 	props->max_cqe		   = dev->dev->caps.max_cqes;
 	props->max_mr		   = dev->dev->quotas.mpt;
-- 
cgit v1.2.3-59-g8ed1b


From 9f34519a82356f6cf0ccb8480ee0ed99b3d0af75 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 31 Aug 2018 11:52:00 -0700
Subject: cxgb4: fix abort_req_rss6 struct

Remove the incorrect WR_HDR field which can cause a misinterpretation
of ABORT CPL by ULDs, such as iw_cxgb4.

Fixes: a3cdaa69e4ae ("cxgb4: Adds CPL support for Shared Receive Queues")
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index b8f75a22fb6c..f152da1ce046 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -753,7 +753,6 @@ struct cpl_abort_req_rss {
 };
 
 struct cpl_abort_req_rss6 {
-	WR_HDR;
 	union opcode_tid ot;
 	__be32 srqidx_status;
 };
-- 
cgit v1.2.3-59-g8ed1b


From 67e3816842fe6414d629c7515b955952ec40c7d7 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 31 Aug 2018 07:16:03 -0700
Subject: RDMA/uverbs: Atomically flush and mark closed the comp event queue

Currently a uverbs completion event queue is flushed of events in
ib_uverbs_comp_event_close() with the queue spinlock held and then
released.  Yet setting ev_queue->is_closed is not set until later in
uverbs_hot_unplug_completion_event_file().

In between the time ib_uverbs_comp_event_close() releases the lock and
uverbs_hot_unplug_completion_event_file() acquires the lock, a completion
event can arrive and be inserted into the event queue by
ib_uverbs_comp_handler().

This can cause a "double add" list_add warning or crash depending on the
kernel configuration, or a memory leak because the event is never dequeued
since the queue is already closed down.

So add setting ev_queue->is_closed = 1 to ib_uverbs_comp_event_close().

Cc: stable@vger.kernel.org
Fixes: 1e7710f3f656 ("IB/core: Change completion channel to use the reworked objects schema")
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 6d974e2363df..50152c1b1004 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -440,6 +440,7 @@ static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp)
 			list_del(&entry->obj_list);
 		kfree(entry);
 	}
+	file->ev_queue.is_closed = 1;
 	spin_unlock_irq(&file->ev_queue.lock);
 
 	uverbs_close_fd(filp);
-- 
cgit v1.2.3-59-g8ed1b


From 5fe23f262e0548ca7f19fb79f89059a60d087d22 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 12 Sep 2018 16:27:44 -0700
Subject: ucma: fix a use-after-free in ucma_resolve_ip()

There is a race condition between ucma_close() and ucma_resolve_ip():

CPU0				CPU1
ucma_resolve_ip():		ucma_close():

ctx = ucma_get_ctx(file, cmd.id);

        list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
                mutex_lock(&mut);
                idr_remove(&ctx_idr, ctx->id);
                mutex_unlock(&mut);
		...
                mutex_lock(&mut);
                if (!ctx->closing) {
                        mutex_unlock(&mut);
                        rdma_destroy_id(ctx->cm_id);
		...
                ucma_free_ctx(ctx);

ret = rdma_resolve_addr();
ucma_put_ctx(ctx);

Before idr_remove(), ucma_get_ctx() could still find the ctx
and after rdma_destroy_id(), rdma_resolve_addr() may still
access id_priv pointer. Also, ucma_put_ctx() may use ctx after
ucma_free_ctx() too.

ucma_close() should call ucma_put_ctx() too which tests the
refcnt and waits for the last one releasing it. The similar
pattern is already used by ucma_destroy_id().

Reported-and-tested-by: syzbot+da2591e115d57a9cbb8b@syzkaller.appspotmail.com
Reported-by: syzbot+cfe3c1e8ef634ba8964b@syzkaller.appspotmail.com
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/ucma.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 5f437d1570fb..21863ddde63e 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1759,6 +1759,8 @@ static int ucma_close(struct inode *inode, struct file *filp)
 		mutex_lock(&mut);
 		if (!ctx->closing) {
 			mutex_unlock(&mut);
+			ucma_put_ctx(ctx);
+			wait_for_completion(&ctx->comp);
 			/* rdma_destroy_id ensures that no event handlers are
 			 * inflight for that id before releasing it.
 			 */
-- 
cgit v1.2.3-59-g8ed1b


From ee92efe41cf358f4b99e73509f2bfd4733609f26 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Mon, 17 Sep 2018 18:10:05 -0700
Subject: IB/srp: Avoid that sg_reset -d ${srp_device} triggers an infinite
 loop

Use different loop variables for the inner and outer loop. This avoids
that an infinite loop occurs if there are more RDMA channels than
target->req_ring_size.

Fixes: d92c0da71a35 ("IB/srp: Add multichannel support")
Cc: <stable@vger.kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 444d16520506..0b34e909505f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -2951,7 +2951,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	struct srp_rdma_ch *ch;
-	int i;
+	int i, j;
 	u8 status;
 
 	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
@@ -2965,8 +2965,8 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 
 	for (i = 0; i < target->ch_count; i++) {
 		ch = &target->ch[i];
-		for (i = 0; i < target->req_ring_size; ++i) {
-			struct srp_request *req = &ch->req_ring[i];
+		for (j = 0; j < target->req_ring_size; ++j) {
+			struct srp_request *req = &ch->req_ring[j];
 
 			srp_finish_req(ch, req, scmnd->device, DID_RESET << 16);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 4eeed3686981ff887bbdd7254139e2eca276534c Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Tue, 18 Sep 2018 10:51:37 +0300
Subject: RDMA/uverbs: Fix validity check for modify QP

Uverbs shouldn't enforce QP state in the command unless the user set the QP
state bit in the attribute mask.

In addition, only copy qp attr fields which have the corresponding bit set
in the attribute mask over to the internal attr structure.

Fixes: 88de869bbe4f ("RDMA/uverbs: Ensure validity of current QP state value")
Fixes: bc38a6abdd5a ("[PATCH] IB uverbs: core implementation")
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c | 68 ++++++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 23 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a21d5214afc3..e012ca80f9d1 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2027,33 +2027,55 @@ static int modify_qp(struct ib_uverbs_file *file,
 
 	if ((cmd->base.attr_mask & IB_QP_CUR_STATE &&
 	    cmd->base.cur_qp_state > IB_QPS_ERR) ||
-	    cmd->base.qp_state > IB_QPS_ERR) {
+	    (cmd->base.attr_mask & IB_QP_STATE &&
+	    cmd->base.qp_state > IB_QPS_ERR)) {
 		ret = -EINVAL;
 		goto release_qp;
 	}
 
-	attr->qp_state		  = cmd->base.qp_state;
-	attr->cur_qp_state	  = cmd->base.cur_qp_state;
-	attr->path_mtu		  = cmd->base.path_mtu;
-	attr->path_mig_state	  = cmd->base.path_mig_state;
-	attr->qkey		  = cmd->base.qkey;
-	attr->rq_psn		  = cmd->base.rq_psn;
-	attr->sq_psn		  = cmd->base.sq_psn;
-	attr->dest_qp_num	  = cmd->base.dest_qp_num;
-	attr->qp_access_flags	  = cmd->base.qp_access_flags;
-	attr->pkey_index	  = cmd->base.pkey_index;
-	attr->alt_pkey_index	  = cmd->base.alt_pkey_index;
-	attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify;
-	attr->max_rd_atomic	  = cmd->base.max_rd_atomic;
-	attr->max_dest_rd_atomic  = cmd->base.max_dest_rd_atomic;
-	attr->min_rnr_timer	  = cmd->base.min_rnr_timer;
-	attr->port_num		  = cmd->base.port_num;
-	attr->timeout		  = cmd->base.timeout;
-	attr->retry_cnt		  = cmd->base.retry_cnt;
-	attr->rnr_retry		  = cmd->base.rnr_retry;
-	attr->alt_port_num	  = cmd->base.alt_port_num;
-	attr->alt_timeout	  = cmd->base.alt_timeout;
-	attr->rate_limit	  = cmd->rate_limit;
+	if (cmd->base.attr_mask & IB_QP_STATE)
+		attr->qp_state = cmd->base.qp_state;
+	if (cmd->base.attr_mask & IB_QP_CUR_STATE)
+		attr->cur_qp_state = cmd->base.cur_qp_state;
+	if (cmd->base.attr_mask & IB_QP_PATH_MTU)
+		attr->path_mtu = cmd->base.path_mtu;
+	if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE)
+		attr->path_mig_state = cmd->base.path_mig_state;
+	if (cmd->base.attr_mask & IB_QP_QKEY)
+		attr->qkey = cmd->base.qkey;
+	if (cmd->base.attr_mask & IB_QP_RQ_PSN)
+		attr->rq_psn = cmd->base.rq_psn;
+	if (cmd->base.attr_mask & IB_QP_SQ_PSN)
+		attr->sq_psn = cmd->base.sq_psn;
+	if (cmd->base.attr_mask & IB_QP_DEST_QPN)
+		attr->dest_qp_num = cmd->base.dest_qp_num;
+	if (cmd->base.attr_mask & IB_QP_ACCESS_FLAGS)
+		attr->qp_access_flags = cmd->base.qp_access_flags;
+	if (cmd->base.attr_mask & IB_QP_PKEY_INDEX)
+		attr->pkey_index = cmd->base.pkey_index;
+	if (cmd->base.attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+		attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify;
+	if (cmd->base.attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+		attr->max_rd_atomic = cmd->base.max_rd_atomic;
+	if (cmd->base.attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic;
+	if (cmd->base.attr_mask & IB_QP_MIN_RNR_TIMER)
+		attr->min_rnr_timer = cmd->base.min_rnr_timer;
+	if (cmd->base.attr_mask & IB_QP_PORT)
+		attr->port_num = cmd->base.port_num;
+	if (cmd->base.attr_mask & IB_QP_TIMEOUT)
+		attr->timeout = cmd->base.timeout;
+	if (cmd->base.attr_mask & IB_QP_RETRY_CNT)
+		attr->retry_cnt = cmd->base.retry_cnt;
+	if (cmd->base.attr_mask & IB_QP_RNR_RETRY)
+		attr->rnr_retry = cmd->base.rnr_retry;
+	if (cmd->base.attr_mask & IB_QP_ALT_PATH) {
+		attr->alt_port_num = cmd->base.alt_port_num;
+		attr->alt_timeout = cmd->base.alt_timeout;
+		attr->alt_pkey_index = cmd->base.alt_pkey_index;
+	}
+	if (cmd->base.attr_mask & IB_QP_RATE_LIMIT)
+		attr->rate_limit = cmd->rate_limit;
 
 	if (cmd->base.attr_mask & IB_QP_AV)
 		copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr,
-- 
cgit v1.2.3-59-g8ed1b


From 0dbfaa9f2813787679e296eb5476e40938ab48c8 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 20 Sep 2018 12:58:46 -0700
Subject: IB/hfi1: Fix SL array bounds check

The SL specified by a user needs to be a valid SL.

Add a range check to the user specified SL value which protects from
running off the end of the SL to SC table.

CC: stable@vger.kernel.org
Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/verbs.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 13374c727b14..a7c586a5589d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1582,6 +1582,7 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
 	struct hfi1_pportdata *ppd;
 	struct hfi1_devdata *dd;
 	u8 sc5;
+	u8 sl;
 
 	if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) &&
 	    !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
@@ -1590,8 +1591,13 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
 	/* test the mapping for validity */
 	ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
 	ppd = ppd_from_ibp(ibp);
-	sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)];
 	dd = dd_from_ppd(ppd);
+
+	sl = rdma_ah_get_sl(ah_attr);
+	if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
+		return -EINVAL;
+
+	sc5 = ibp->sl_to_sc[sl];
 	if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
 		return -EINVAL;
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 94694d18cf27a6faad91487a38ce516c2b16e7d9 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Thu, 20 Sep 2018 12:58:56 -0700
Subject: IB/hfi1: Invalid user input can result in crash

If the number of packets in a user sdma request does not match
the actual iovectors being sent, sdma_cleanup can be called on
an uninitialized request structure, resulting in a crash similar
to this:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
IP: [<ffffffffc0ae8bb7>] __sdma_txclean+0x57/0x1e0 [hfi1]
PGD 8000001044f61067 PUD 1052706067 PMD 0
Oops: 0000 [#1] SMP
CPU: 30 PID: 69912 Comm: upsm Kdump: loaded Tainted: G           OE
------------   3.10.0-862.el7.x86_64 #1
Hardware name: Intel Corporation S2600KPR/S2600KPR, BIOS
SE5C610.86B.01.01.0019.101220160604 10/12/2016
task: ffff8b331c890000 ti: ffff8b2ed1f98000 task.ti: ffff8b2ed1f98000
RIP: 0010:[<ffffffffc0ae8bb7>]  [<ffffffffc0ae8bb7>] __sdma_txclean+0x57/0x1e0
[hfi1]
RSP: 0018:ffff8b2ed1f9bab0  EFLAGS: 00010286
RAX: 0000000000008b2b RBX: ffff8b2adf6e0000 RCX: 0000000000000000
RDX: 00000000000000a0 RSI: ffff8b2e9eedc540 RDI: ffff8b2adf6e0000
RBP: ffff8b2ed1f9bad8 R08: 0000000000000000 R09: ffffffffc0b04a06
R10: ffff8b331c890190 R11: ffffe6ed00bf1840 R12: ffff8b3315480000
R13: ffff8b33154800f0 R14: 00000000fffffff2 R15: ffff8b2e9eedc540
FS:  00007f035ac47740(0000) GS:ffff8b331e100000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000008 CR3: 0000000c03fe6000 CR4: 00000000001607e0
Call Trace:
 [<ffffffffc0b0570d>] user_sdma_send_pkts+0xdcd/0x1990 [hfi1]
 [<ffffffff9fe75fb0>] ? gup_pud_range+0x140/0x290
 [<ffffffffc0ad3105>] ? hfi1_mmu_rb_insert+0x155/0x1b0 [hfi1]
 [<ffffffffc0b0777b>] hfi1_user_sdma_process_request+0xc5b/0x11b0 [hfi1]
 [<ffffffffc0ac193a>] hfi1_aio_write+0xba/0x110 [hfi1]
 [<ffffffffa001a2bb>] do_sync_readv_writev+0x7b/0xd0
 [<ffffffffa001bede>] do_readv_writev+0xce/0x260
 [<ffffffffa022b089>] ? tty_ldisc_deref+0x19/0x20
 [<ffffffffa02268c0>] ? n_tty_ioctl+0xe0/0xe0
 [<ffffffffa001c105>] vfs_writev+0x35/0x60
 [<ffffffffa001c2bf>] SyS_writev+0x7f/0x110
 [<ffffffffa051f7d5>] system_call_fastpath+0x1c/0x21
Code: 06 49 c7 47 18 00 00 00 00 0f 87 89 01 00 00 5b 41 5c 41 5d 41 5e 41 5f
5d c3 66 2e 0f 1f 84 00 00 00 00 00 48 8b 4e 10 48 89 fb <48> 8b 51 08 49 89 d4
83 e2 0c 41 81 e4 00 e0 00 00 48 c1 ea 02
RIP  [<ffffffffc0ae8bb7>] __sdma_txclean+0x57/0x1e0 [hfi1]
 RSP <ffff8b2ed1f9bab0>
CR2: 0000000000000008

There are two exit points from user_sdma_send_pkts().  One (free_tx)
merely frees the slab entry and one (free_txreq) cleans the sdma_txreq
prior to freeing the slab entry.   The free_txreq variation can only be
called after one of the sdma_init*() variations has been called.

In the panic case, the slab entry had been allocated but not inited.

Fix the issue by exiting through free_tx thus avoiding sdma_clean().

Cc: <stable@vger.kernel.org> # 4.9.x+
Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Lukasz Odzioba <lukasz.odzioba@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/user_sdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index a3a7b33196d6..5c88706121c1 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -828,7 +828,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 				if (++req->iov_idx == req->data_iovs) {
 					ret = -EFAULT;
-					goto free_txreq;
+					goto free_tx;
 				}
 				iovec = &req->iovs[req->iov_idx];
 				WARN_ON(iovec->offset);
-- 
cgit v1.2.3-59-g8ed1b


From d623500b3c4efd8d4e945ac9003c6b87b469a9ab Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Thu, 20 Sep 2018 12:59:05 -0700
Subject: IB/hfi1: Fix context recovery when PBC has an UnsupportedVL

If a packet stream uses an UnsupportedVL (virtual lane), the send
engine will not send the packet, and it will not indicate that an
error has occurred.  This will cause the packet stream to block.

HFI has 8 virtual lanes available for packet streams.  Each lane can
be enabled or disabled using the UnsupportedVL mask.  If a lane is
disabled, adding a packet to the send context must be disallowed.

The current mask for determining unsupported VLs defaults to 0 (allow
all).  This is incorrect.  Only the VLs that are defined should be
allowed.

Determine which VLs are disabled (mtu == 0), and set the appropriate
unsupported bit in the mask.  The correct mask will allow the send
engine to error on the invalid VL, and error recovery will work
correctly.

Cc: <stable@vger.kernel.org> # 4.9.x+
Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Lukasz Odzioba <lukasz.odzioba@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/pio.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index c2c1cba5b23b..cd962c9ea6bc 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -86,6 +86,7 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
 	unsigned long flags;
 	int write = 1;	/* write sendctrl back */
 	int flush = 0;	/* re-read sendctrl to make sure it is flushed */
+	int i;
 
 	spin_lock_irqsave(&dd->sendctrl_lock, flags);
 
@@ -95,9 +96,13 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
 		reg |= SEND_CTRL_SEND_ENABLE_SMASK;
 	/* Fall through */
 	case PSC_DATA_VL_ENABLE:
+		mask = 0;
+		for (i = 0; i < ARRAY_SIZE(dd->vld); i++)
+			if (!dd->vld[i].mtu)
+				mask |= BIT_ULL(i);
 		/* Disallow sending on VLs not enabled */
-		mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
-				SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+		mask = (mask & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
+			SEND_CTRL_UNSUPPORTED_VL_SHIFT;
 		reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
 		break;
 	case PSC_GLOBAL_DISABLE:
-- 
cgit v1.2.3-59-g8ed1b


From b4a4957d3d1c328b733fce783b7264996f866ad2 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Thu, 20 Sep 2018 12:59:14 -0700
Subject: IB/hfi1: Fix destroy_qp hang after a link down

rvt_destroy_qp() cannot complete until all in process packets have
been released from the underlying hardware.  If a link down event
occurs, an application can hang with a kernel stack similar to:

cat /proc/<app PID>/stack
 quiesce_qp+0x178/0x250 [hfi1]
 rvt_reset_qp+0x23d/0x400 [rdmavt]
 rvt_destroy_qp+0x69/0x210 [rdmavt]
 ib_destroy_qp+0xba/0x1c0 [ib_core]
 nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma]
 nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma]
 nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma]
 nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma]
 process_one_work+0x17a/0x440
 worker_thread+0x126/0x3c0
 kthread+0xcf/0xe0
 ret_from_fork+0x58/0x90
 0xffffffffffffffff

quiesce_qp() waits until all outstanding packets have been freed.
This wait should be momentary.  During a link down event, the cleanup
handling does not ensure that all packets caught by the link down are
flushed properly.

This is caused by the fact that the freeze path and the link down
event is handled the same.  This is not correct.  The freeze path
waits until the HFI is unfrozen and then restarts PIO.  A link down
is not a freeze event.  The link down path cannot restart the PIO
until link is restored.  If the PIO path is restarted before the link
comes up, the application (QP) using the PIO path will hang (until
link is restored).

Fix by separating the linkdown path from the freeze path and use the
link down path for link down events.

Close a race condition sc_disable() by acquiring both the progress
and release locks.

Close a race condition in sc_stop() by moving the setting of the flag
bits under the alloc lock.

Cc: <stable@vger.kernel.org> # 4.9.x+
Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/chip.c |  6 +++++-
 drivers/infiniband/hw/hfi1/pio.c  | 42 +++++++++++++++++++++++++++++++--------
 drivers/infiniband/hw/hfi1/pio.h  |  2 ++
 3 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 2c19bf772451..e1668bcc2d13 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -6733,6 +6733,7 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
 	struct hfi1_devdata *dd = ppd->dd;
 	struct send_context *sc;
 	int i;
+	int sc_flags;
 
 	if (flags & FREEZE_SELF)
 		write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
@@ -6743,11 +6744,13 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
 	/* notify all SDMA engines that they are going into a freeze */
 	sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
 
+	sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ?
+					      SCF_LINK_DOWN : 0);
 	/* do halt pre-handling on all enabled send contexts */
 	for (i = 0; i < dd->num_send_contexts; i++) {
 		sc = dd->send_contexts[i].sc;
 		if (sc && (sc->flags & SCF_ENABLED))
-			sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+			sc_stop(sc, sc_flags);
 	}
 
 	/* Send context are frozen. Notify user space */
@@ -10674,6 +10677,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
 
 		handle_linkup_change(dd, 1);
+		pio_kernel_linkup(dd);
 
 		/*
 		 * After link up, a new link width will have been set.
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index cd962c9ea6bc..752057647f09 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -926,20 +926,18 @@ void sc_free(struct send_context *sc)
 void sc_disable(struct send_context *sc)
 {
 	u64 reg;
-	unsigned long flags;
 	struct pio_buf *pbuf;
 
 	if (!sc)
 		return;
 
 	/* do all steps, even if already disabled */
-	spin_lock_irqsave(&sc->alloc_lock, flags);
+	spin_lock_irq(&sc->alloc_lock);
 	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
 	reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
 	sc->flags &= ~SCF_ENABLED;
 	sc_wait_for_packet_egress(sc, 1);
 	write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
-	spin_unlock_irqrestore(&sc->alloc_lock, flags);
 
 	/*
 	 * Flush any waiters.  Once the context is disabled,
@@ -949,7 +947,7 @@ void sc_disable(struct send_context *sc)
 	 * proceed with the flush.
 	 */
 	udelay(1);
-	spin_lock_irqsave(&sc->release_lock, flags);
+	spin_lock(&sc->release_lock);
 	if (sc->sr) {	/* this context has a shadow ring */
 		while (sc->sr_tail != sc->sr_head) {
 			pbuf = &sc->sr[sc->sr_tail].pbuf;
@@ -960,7 +958,8 @@ void sc_disable(struct send_context *sc)
 				sc->sr_tail = 0;
 		}
 	}
-	spin_unlock_irqrestore(&sc->release_lock, flags);
+	spin_unlock(&sc->release_lock);
+	spin_unlock_irq(&sc->alloc_lock);
 }
 
 /* return SendEgressCtxtStatus.PacketOccupancy */
@@ -1183,11 +1182,39 @@ void pio_kernel_unfreeze(struct hfi1_devdata *dd)
 		sc = dd->send_contexts[i].sc;
 		if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
 			continue;
+		if (sc->flags & SCF_LINK_DOWN)
+			continue;
 
 		sc_enable(sc);	/* will clear the sc frozen flag */
 	}
 }
 
+/**
+ * pio_kernel_linkup() - Re-enable send contexts after linkup event
+ * @dd: valid devive data
+ *
+ * When the link goes down, the freeze path is taken.  However, a link down
+ * event is different from a freeze because if the send context is re-enabled
+ * whowever is sending data will start sending data again, which will hang
+ * any QP that is sending data.
+ *
+ * The freeze path now looks at the type of event that occurs and takes this
+ * path for link down event.
+ */
+void pio_kernel_linkup(struct hfi1_devdata *dd)
+{
+	struct send_context *sc;
+	int i;
+
+	for (i = 0; i < dd->num_send_contexts; i++) {
+		sc = dd->send_contexts[i].sc;
+		if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER)
+			continue;
+
+		sc_enable(sc);	/* will clear the sc link down flag */
+	}
+}
+
 /*
  * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
  * Returns:
@@ -1387,11 +1414,10 @@ void sc_stop(struct send_context *sc, int flag)
 {
 	unsigned long flags;
 
-	/* mark the context */
-	sc->flags |= flag;
-
 	/* stop buffer allocations */
 	spin_lock_irqsave(&sc->alloc_lock, flags);
+	/* mark the context */
+	sc->flags |= flag;
 	sc->flags &= ~SCF_ENABLED;
 	spin_unlock_irqrestore(&sc->alloc_lock, flags);
 	wake_up(&sc->halt_wait);
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index 058b08f459ab..aaf372c3e5d6 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -139,6 +139,7 @@ struct send_context {
 #define SCF_IN_FREE 0x02
 #define SCF_HALTED  0x04
 #define SCF_FROZEN  0x08
+#define SCF_LINK_DOWN 0x10
 
 struct send_context_info {
 	struct send_context *sc;	/* allocated working context */
@@ -306,6 +307,7 @@ void set_pio_integrity(struct send_context *sc);
 void pio_reset_all(struct hfi1_devdata *dd);
 void pio_freeze(struct hfi1_devdata *dd);
 void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+void pio_kernel_linkup(struct hfi1_devdata *dd);
 
 /* global PIO send control operations */
 #define PSC_GLOBAL_ENABLE 0
-- 
cgit v1.2.3-59-g8ed1b


From de5c95d0f518537f59ee5aef762abc46f868c377 Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Thu, 20 Sep 2018 22:33:00 -0700
Subject: RDMA/bnxt_re: Fix system crash during RDMA resource initialization

bnxt_re_ib_reg acquires and releases the rtnl lock whenever it accesses
the L2 driver.

The following sequence can trigger a crash

Acquires the rtnl_lock ->
	Registers roce driver callback with L2 driver ->
		release the rtnl lock
bnxt_re acquires the rtnl_lock ->
	Request for MSIx vectors ->
		release the rtnl_lock

Issue happens when bnxt_re proceeds with remaining part of initialization
and L2 driver invokes bnxt_ulp_irq_stop as a part of bnxt_open_nic.

The crash is in bnxt_qplib_nq_stop_irq as the NQ structures are
not initialized yet,

<snip>
[ 3551.726647] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 3551.726656] IP: [<ffffffffc0840ee9>] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re]
[ 3551.726674] PGD 0
[ 3551.726679] Oops: 0002 1 SMP
...
[ 3551.726822] Hardware name: Dell Inc. PowerEdge R720/08RW36, BIOS 2.4.3 07/09/2014
[ 3551.726826] task: ffff97e30eec5ee0 ti: ffff97e3173bc000 task.ti: ffff97e3173bc000
[ 3551.726829] RIP: 0010:[<ffffffffc0840ee9>] [<ffffffffc0840ee9>]
bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re]
...
[ 3551.726872] Call Trace:
[ 3551.726886] [<ffffffffc082cb9e>] bnxt_re_stop_irq+0x4e/0x70 [bnxt_re]
[ 3551.726899] [<ffffffffc07d6a53>] bnxt_ulp_irq_stop+0x43/0x70 [bnxt_en]
[ 3551.726908] [<ffffffffc07c82f4>] bnxt_reserve_rings+0x174/0x1e0 [bnxt_en]
[ 3551.726917] [<ffffffffc07cafd8>] __bnxt_open_nic+0x368/0x9a0 [bnxt_en]
[ 3551.726925] [<ffffffffc07cb62b>] bnxt_open_nic+0x1b/0x50 [bnxt_en]
[ 3551.726934] [<ffffffffc07cc62f>] bnxt_setup_mq_tc+0x11f/0x260 [bnxt_en]
[ 3551.726943] [<ffffffffc07d5f58>] bnxt_dcbnl_ieee_setets+0xb8/0x1f0 [bnxt_en]
[ 3551.726954] [<ffffffff890f983a>] dcbnl_ieee_set+0x9a/0x250
[ 3551.726966] [<ffffffff88fd6d21>] ? __alloc_skb+0xa1/0x2d0
[ 3551.726972] [<ffffffff890f72fa>] dcb_doit+0x13a/0x210
[ 3551.726981] [<ffffffff89003ff7>] rtnetlink_rcv_msg+0xa7/0x260
[ 3551.726989] [<ffffffff88ffdb00>] ? rtnl_unicast+0x20/0x30
[ 3551.726996] [<ffffffff88bf9dc8>] ? __kmalloc_node_track_caller+0x58/0x290
[ 3551.727002] [<ffffffff890f7326>] ? dcb_doit+0x166/0x210
[ 3551.727007] [<ffffffff88fd6d0d>] ? __alloc_skb+0x8d/0x2d0
[ 3551.727012] [<ffffffff89003f50>] ? rtnl_newlink+0x880/0x880
...
[ 3551.727104] [<ffffffff8911f7d5>] system_call_fastpath+0x1c/0x21
...
[ 3551.727164] RIP [<ffffffffc0840ee9>] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re]
[ 3551.727175] RSP <ffff97e3173bf788>
[ 3551.727177] CR2: 0000000000000000

Avoid this inconsistent state and  system crash by acquiring
the rtnl lock for the entire duration of device initialization.
Re-factor the code to remove the rtnl lock from the individual function
and acquire and release it from the caller.

Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver")
Fixes: 6e04b1035689 ("RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes")
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 93 +++++++++++++++---------------------
 1 file changed, 38 insertions(+), 55 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 20b9f31052bf..85cd1a3593d6 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -78,7 +78,7 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
 /* Mutex to protect the list of bnxt_re devices added */
 static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait);
+static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
 
 /* SR-IOV helper functions */
 
@@ -182,7 +182,7 @@ static void bnxt_re_shutdown(void *p)
 	if (!rdev)
 		return;
 
-	bnxt_re_ib_unreg(rdev, false);
+	bnxt_re_ib_unreg(rdev);
 }
 
 static void bnxt_re_stop_irq(void *handle)
@@ -251,7 +251,7 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
 /* Driver registration routines used to let the networking driver (bnxt_en)
  * to know that the RoCE driver is now installed
  */
-static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait)
+static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_en_dev *en_dev;
 	int rc;
@@ -260,14 +260,9 @@ static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait)
 		return -EINVAL;
 
 	en_dev = rdev->en_dev;
-	/* Acquire rtnl lock if it is not invokded from netdev event */
-	if (lock_wait)
-		rtnl_lock();
 
 	rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev,
 						    BNXT_ROCE_ULP);
-	if (lock_wait)
-		rtnl_unlock();
 	return rc;
 }
 
@@ -281,14 +276,12 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
 
 	en_dev = rdev->en_dev;
 
-	rtnl_lock();
 	rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
 						  &bnxt_re_ulp_ops, rdev);
-	rtnl_unlock();
 	return rc;
 }
 
-static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait)
+static int bnxt_re_free_msix(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_en_dev *en_dev;
 	int rc;
@@ -298,13 +291,9 @@ static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait)
 
 	en_dev = rdev->en_dev;
 
-	if (lock_wait)
-		rtnl_lock();
 
 	rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP);
 
-	if (lock_wait)
-		rtnl_unlock();
 	return rc;
 }
 
@@ -320,7 +309,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev)
 
 	num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus());
 
-	rtnl_lock();
 	num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP,
 							 rdev->msix_entries,
 							 num_msix_want);
@@ -335,7 +323,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev)
 	}
 	rdev->num_msix = num_msix_got;
 done:
-	rtnl_unlock();
 	return rc;
 }
 
@@ -358,24 +345,18 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg,
 	fw_msg->timeout = timeout;
 }
 
-static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id,
-				 bool lock_wait)
+static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
 {
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
 	struct hwrm_ring_free_input req = {0};
 	struct hwrm_ring_free_output resp;
 	struct bnxt_fw_msg fw_msg;
-	bool do_unlock = false;
 	int rc = -EINVAL;
 
 	if (!en_dev)
 		return rc;
 
 	memset(&fw_msg, 0, sizeof(fw_msg));
-	if (lock_wait) {
-		rtnl_lock();
-		do_unlock = true;
-	}
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
 	req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
@@ -386,8 +367,6 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id,
 	if (rc)
 		dev_err(rdev_to_dev(rdev),
 			"Failed to free HW ring:%d :%#x", req.ring_id, rc);
-	if (do_unlock)
-		rtnl_unlock();
 	return rc;
 }
 
@@ -405,7 +384,6 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
 		return rc;
 
 	memset(&fw_msg, 0, sizeof(fw_msg));
-	rtnl_lock();
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1);
 	req.enables = 0;
 	req.page_tbl_addr =  cpu_to_le64(dma_arr[0]);
@@ -426,27 +404,21 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
 	if (!rc)
 		*fw_ring_id = le16_to_cpu(resp.ring_id);
 
-	rtnl_unlock();
 	return rc;
 }
 
 static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
-				      u32 fw_stats_ctx_id, bool lock_wait)
+				      u32 fw_stats_ctx_id)
 {
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
 	struct hwrm_stat_ctx_free_input req = {0};
 	struct bnxt_fw_msg fw_msg;
-	bool do_unlock = false;
 	int rc = -EINVAL;
 
 	if (!en_dev)
 		return rc;
 
 	memset(&fw_msg, 0, sizeof(fw_msg));
-	if (lock_wait) {
-		rtnl_lock();
-		do_unlock = true;
-	}
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1);
 	req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id);
@@ -457,8 +429,6 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
 		dev_err(rdev_to_dev(rdev),
 			"Failed to free HW stats context %#x", rc);
 
-	if (do_unlock)
-		rtnl_unlock();
 	return rc;
 }
 
@@ -478,7 +448,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev,
 		return rc;
 
 	memset(&fw_msg, 0, sizeof(fw_msg));
-	rtnl_lock();
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1);
 	req.update_period_ms = cpu_to_le32(1000);
@@ -490,7 +459,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev,
 	if (!rc)
 		*fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id);
 
-	rtnl_unlock();
 	return rc;
 }
 
@@ -929,19 +897,19 @@ fail:
 	return rc;
 }
 
-static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev, bool lock_wait)
+static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
 {
 	int i;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, lock_wait);
+		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
 	}
 }
 
-static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait)
+static void bnxt_re_free_res(struct bnxt_re_dev *rdev)
 {
-	bnxt_re_free_nq_res(rdev, lock_wait);
+	bnxt_re_free_nq_res(rdev);
 
 	if (rdev->qplib_res.dpi_tbl.max) {
 		bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
@@ -1219,7 +1187,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 	return 0;
 }
 
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
+static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 {
 	int i, rc;
 
@@ -1234,28 +1202,27 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
 		cancel_delayed_work(&rdev->worker);
 
 	bnxt_re_cleanup_res(rdev);
-	bnxt_re_free_res(rdev, lock_wait);
+	bnxt_re_free_res(rdev);
 
 	if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
 		rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw);
 		if (rc)
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to deinitialize RCFW: %#x", rc);
-		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id,
-					   lock_wait);
+		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 		bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
-		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, lock_wait);
+		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
 		bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
-		rc = bnxt_re_free_msix(rdev, lock_wait);
+		rc = bnxt_re_free_msix(rdev);
 		if (rc)
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to free MSI-X vectors: %#x", rc);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
-		rc = bnxt_re_unregister_netdev(rdev, lock_wait);
+		rc = bnxt_re_unregister_netdev(rdev);
 		if (rc)
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to unregister with netdev: %#x", rc);
@@ -1276,6 +1243,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 {
 	int i, j, rc;
 
+	bool locked;
+
+	/* Acquire rtnl lock through out this function */
+	rtnl_lock();
+	locked = true;
+
 	/* Registered a new RoCE device instance to netdev */
 	rc = bnxt_re_register_netdev(rdev);
 	if (rc) {
@@ -1374,12 +1347,16 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 	}
 
+	rtnl_unlock();
+	locked = false;
+
 	/* Register ib dev */
 	rc = bnxt_re_register_ib(rdev);
 	if (rc) {
 		pr_err("Failed to register with IB: %#x\n", rc);
 		goto fail;
 	}
+	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	dev_info(rdev_to_dev(rdev), "Device registered successfully");
 	for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) {
 		rc = device_create_file(&rdev->ibdev.dev,
@@ -1395,7 +1372,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 			goto fail;
 		}
 	}
-	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
 	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
@@ -1404,17 +1380,21 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 
 	return 0;
 free_sctx:
-	bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, true);
+	bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 free_ctx:
 	bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
 disable_rcfw:
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
-	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, true);
+	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
-	bnxt_re_ib_unreg(rdev, true);
+	if (!locked)
+		rtnl_lock();
+	bnxt_re_ib_unreg(rdev);
+	rtnl_unlock();
+
 	return rc;
 }
 
@@ -1567,7 +1547,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		 */
 		if (atomic_read(&rdev->sched_count) > 0)
 			goto exit;
-		bnxt_re_ib_unreg(rdev, false);
+		bnxt_re_ib_unreg(rdev);
 		bnxt_re_remove_one(rdev);
 		bnxt_re_dev_unreg(rdev);
 		break;
@@ -1646,7 +1626,10 @@ static void __exit bnxt_re_mod_exit(void)
 		 */
 		flush_workqueue(bnxt_re_wq);
 		bnxt_re_dev_stop(rdev);
-		bnxt_re_ib_unreg(rdev, true);
+		/* Acquire the rtnl_lock as the L2 resources are freed here */
+		rtnl_lock();
+		bnxt_re_ib_unreg(rdev);
+		rtnl_unlock();
 		bnxt_re_remove_one(rdev);
 		bnxt_re_dev_unreg(rdev);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a9360abd3de0aad745d25d003923d56afb28a04b Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 25 Sep 2018 11:23:55 +0300
Subject: IB/uverbs: Free uapi on destroy

Make sure we free struct uverbs_api once we clean the radix tree. It was
allocated by uverbs_alloc_api().

Fixes: 9ed3e5f44772 ("IB/uverbs: Build the specs into a radix tree at runtime")
Reported-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_uapi.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 73ea6f0db88f..be854628a7c6 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -248,6 +248,7 @@ void uverbs_destroy_api(struct uverbs_api *uapi)
 		kfree(rcu_dereference_protected(*slot, true));
 		radix_tree_iter_delete(&uapi->radix, &iter, slot);
 	}
+	kfree(uapi);
 }
 
 struct uverbs_api *uverbs_alloc_api(
-- 
cgit v1.2.3-59-g8ed1b


From e8ef090a614292db01b5956a6f5467afbe6c5cf7 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 25 Sep 2018 12:11:12 +0300
Subject: IB/mlx5: Destroy the DEVX object upon error flow

Upon DEVX object creation the object must be destroyed upon a follows
error flow.

Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality")
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index ac116d63e466..f2f11e652dcd 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -723,6 +723,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 		attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE);
 	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
 	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
 	struct devx_obj *obj;
 	int err;
 
@@ -754,10 +755,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 
 	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
 	if (err)
-		goto obj_free;
+		goto obj_destroy;
 
 	return 0;
 
+obj_destroy:
+	mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
 obj_free:
 	kfree(obj);
 	return err;
-- 
cgit v1.2.3-59-g8ed1b


From 5c5702e259dc66e6fceed5117effab79c186e87a Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Tue, 25 Sep 2018 12:10:40 +0300
Subject: RDMA/core: Set right entry state before releasing reference

Currently add_modify_gid() for IB link layer has followong issue
in cache update path.

When GID update event occurs, core releases reference to the GID
table without updating its state and/or entry pointer.

CPU-0                              CPU-1
------                             -----
ib_cache_update()                    IPoIB ULP
   add_modify_gid()                   [..]
      put_gid_entry()
      refcnt = 0, but
      state = valid,
      entry is valid.
      (work item is not yet executed).
                                   ipoib_create_ah()
                                     rdma_create_ah()
                                        rdma_get_gid_attr() <--
                                   	Tries to acquire gid_attr
                                        which has refcnt = 0.
                                   	This is incorrect.

GID entry state and entry pointer is provides the accurate GID enty
state. Such fields must be updated with rwlock to protect against
readers and, such fields must be in sane state before refcount can drop
to zero. Otherwise above race condition can happen leading to
use-after-free situation.

Following backtrace has been observed when cache update for an IB port
is triggered while IPoIB ULP is creating an AH.

Therefore, when updating GID entry, first mark a valid entry as invalid
through state and set the barrier so that no callers can acquired
the GID entry, followed by release reference to it.

refcount_t: increment on 0; use-after-free.
WARNING: CPU: 4 PID: 29106 at lib/refcount.c:153 refcount_inc_checked+0x30/0x50
Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core]
RIP: 0010:refcount_inc_checked+0x30/0x50
RSP: 0018:ffff8802ad36f600 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000008 RDI: ffffffff86710100
RBP: ffff8802d6e60a30 R08: ffffed005d67bf8b R09: ffffed005d67bf8b
R10: 0000000000000001 R11: ffffed005d67bf8a R12: ffff88027620cee8
R13: ffff8802d6e60988 R14: ffff8802d6e60a78 R15: 0000000000000202
FS: 0000000000000000(0000) GS:ffff8802eb200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f3ab35e5c88 CR3: 00000002ce84a000 CR4: 00000000000006e0
IPv6: ADDRCONF(NETDEV_CHANGE): ib1: link becomes ready
Call Trace:
rdma_get_gid_attr+0x220/0x310 [ib_core]
? lock_acquire+0x145/0x3a0
rdma_fill_sgid_attr+0x32c/0x470 [ib_core]
rdma_create_ah+0x89/0x160 [ib_core]
? rdma_fill_sgid_attr+0x470/0x470 [ib_core]
? ipoib_create_ah+0x52/0x260 [ib_ipoib]
ipoib_create_ah+0xf5/0x260 [ib_ipoib]
ipoib_mcast_join_complete+0xbbe/0x2540 [ib_ipoib]

Fixes: b150c3862d21 ("IB/core: Introduce GID entry reference counts")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cache.c | 68 ++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 0bee1f4b914e..3208ad6ad540 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -337,6 +337,39 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
 	return 0;
 }
 
+/**
+ * del_gid - Delete GID table entry
+ *
+ * @ib_dev:	IB device whose GID entry to be deleted
+ * @port:	Port number of the IB device
+ * @table:	GID table of the IB device for a port
+ * @ix:		GID entry index to delete
+ *
+ */
+static void del_gid(struct ib_device *ib_dev, u8 port,
+		    struct ib_gid_table *table, int ix)
+{
+	struct ib_gid_table_entry *entry;
+
+	lockdep_assert_held(&table->lock);
+
+	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+		 ib_dev->name, port, ix,
+		 table->data_vec[ix]->attr.gid.raw);
+
+	write_lock_irq(&table->rwlock);
+	entry = table->data_vec[ix];
+	entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+	/*
+	 * For non RoCE protocol, GID entry slot is ready to use.
+	 */
+	if (!rdma_protocol_roce(ib_dev, port))
+		table->data_vec[ix] = NULL;
+	write_unlock_irq(&table->rwlock);
+
+	put_gid_entry_locked(entry);
+}
+
 /**
  * add_modify_gid - Add or modify GID table entry
  *
@@ -358,7 +391,7 @@ static int add_modify_gid(struct ib_gid_table *table,
 	 * this index.
 	 */
 	if (is_gid_entry_valid(table->data_vec[attr->index]))
-		put_gid_entry(table->data_vec[attr->index]);
+		del_gid(attr->device, attr->port_num, table, attr->index);
 
 	/*
 	 * Some HCA's report multiple GID entries with only one valid GID, and
@@ -386,39 +419,6 @@ done:
 	return ret;
 }
 
-/**
- * del_gid - Delete GID table entry
- *
- * @ib_dev:	IB device whose GID entry to be deleted
- * @port:	Port number of the IB device
- * @table:	GID table of the IB device for a port
- * @ix:		GID entry index to delete
- *
- */
-static void del_gid(struct ib_device *ib_dev, u8 port,
-		    struct ib_gid_table *table, int ix)
-{
-	struct ib_gid_table_entry *entry;
-
-	lockdep_assert_held(&table->lock);
-
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 ib_dev->name, port, ix,
-		 table->data_vec[ix]->attr.gid.raw);
-
-	write_lock_irq(&table->rwlock);
-	entry = table->data_vec[ix];
-	entry->state = GID_TABLE_ENTRY_PENDING_DEL;
-	/*
-	 * For non RoCE protocol, GID entry slot is ready to use.
-	 */
-	if (!rdma_protocol_roce(ib_dev, port))
-		table->data_vec[ix] = NULL;
-	write_unlock_irq(&table->rwlock);
-
-	put_gid_entry_locked(entry);
-}
-
 /* rwlock should be read locked, or lock should be held */
 static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 		    const struct ib_gid_attr *val, bool default_gid,
-- 
cgit v1.2.3-59-g8ed1b


From dd9a403495704fc80fb9f399003013ef2be2ee23 Mon Sep 17 00:00:00 2001
From: Valentine Fatiev <Valentinef@mellanox.com>
Date: Wed, 10 Oct 2018 09:56:25 +0300
Subject: IB/mlx5: Unmap DMA addr from HCA before IOMMU

The function that puts back the MR in cache also removes the DMA address
from the HCA. Therefore we need to call this function before we remove
the DMA mapping from MMU. Otherwise the HCA may access a memory that
is no longer DMA mapped.

Call trace:
NMI: IOCK error (debug interrupt?) for reason 71 on CPU 0.
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0-rc6+ #4
Hardware name: HP ProLiant DL360p Gen8, BIOS P71 08/20/2012
RIP: 0010:intel_idle+0x73/0x120
Code: 80 5c 01 00 0f ae 38 0f ae f0 31 d2 65 48 8b 04 25 80 5c 01 00 48 89 d1 0f 60 02
RSP: 0018:ffffffff9a403e38 EFLAGS: 00000046
RAX: 0000000000000030 RBX: 0000000000000005 RCX: 0000000000000001
RDX: 0000000000000000 RSI: ffffffff9a5790c0 RDI: 0000000000000000
RBP: 0000000000000030 R08: 0000000000000000 R09: 0000000000007cf9
R10: 000000000000030a R11: 0000000000000018 R12: 0000000000000000
R13: ffffffff9a5792b8 R14: ffffffff9a5790c0 R15: 0000002b48471e4d
FS:  0000000000000000(0000) GS:ffff9c6caf400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5737185000 CR3: 0000000590c0a002 CR4: 00000000000606f0
Call Trace:
 cpuidle_enter_state+0x7e/0x2e0
 do_idle+0x1ed/0x290
 cpu_startup_entry+0x6f/0x80
 start_kernel+0x524/0x544
 ? set_init_arg+0x55/0x55
 secondary_startup_64+0xa4/0xb0
DMAR: DRHD: handling fault status reg 2
DMAR: [DMA Read] Request device [04:00.0] fault addr b34d2000 [fault reason 06] PTE Read access is not set
DMAR: [DMA Read] Request device [01:00.2] fault addr bff8b000 [fault reason 06] PTE Read access is not set

Fixes: f3f134f5260a ("RDMA/mlx5: Fix crash while accessing garbage pointer and freed memory")
Signed-off-by: Valentine Fatiev <valentinef@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9fb1d9cb9401..e22314837645 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -544,6 +544,9 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	int shrink = 0;
 	int c;
 
+	if (!mr->allocated_from_cache)
+		return;
+
 	c = order2idx(dev, mr->order);
 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
 		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
@@ -1647,18 +1650,19 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		umem = NULL;
 	}
 #endif
-
 	clean_mr(dev, mr);
 
+	/*
+	 * We should unregister the DMA address from the HCA before
+	 * remove the DMA mapping.
+	 */
+	mlx5_mr_cache_free(dev, mr);
 	if (umem) {
 		ib_umem_release(umem);
 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
 	}
-
 	if (!mr->allocated_from_cache)
 		kfree(mr);
-	else
-		mlx5_mr_cache_free(dev, mr);
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
-- 
cgit v1.2.3-59-g8ed1b