aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/hfi1/verbs.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/hfi1/verbs.c')
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c259
1 files changed, 26 insertions, 233 deletions
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 13374c727b14..48e11e510358 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -129,8 +129,6 @@ unsigned short piothreshold = 256;
module_param(piothreshold, ushort, S_IRUGO);
MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE 2
static unsigned int sge_copy_mode;
module_param(sge_copy_mode, uint, S_IRUGO);
MODULE_PARM_DESC(sge_copy_mode,
@@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp,
/* 16B trailing buffer */
static const u8 trail_buf[MAX_16B_PADDING];
-static uint wss_threshold;
+static uint wss_threshold = 80;
module_param(wss_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
static uint wss_clean_period = 256;
module_param(wss_clean_period, uint, S_IRUGO);
MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
-/* memory working set size */
-struct hfi1_wss {
- unsigned long *entries;
- atomic_t total_count;
- atomic_t clean_counter;
- atomic_t clean_entry;
-
- int threshold;
- int num_entries;
- long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
- long llc_size;
- long llc_bits;
- long table_size;
- long table_bits;
-
- /* check for a valid percent range - default to 80 if none or invalid */
- if (wss_threshold < 1 || wss_threshold > 100)
- wss_threshold = 80;
- /* reject a wildly large period */
- if (wss_clean_period > 1000000)
- wss_clean_period = 256;
- /* reject a zero period */
- if (wss_clean_period == 0)
- wss_clean_period = 1;
-
- /*
- * Calculate the table size - the next power of 2 larger than the
- * LLC size. LLC size is in KiB.
- */
- llc_size = wss_llc_size() * 1024;
- table_size = roundup_pow_of_two(llc_size);
-
- /* one bit per page in rounded up table */
- llc_bits = llc_size / PAGE_SIZE;
- table_bits = table_size / PAGE_SIZE;
- wss.pages_mask = table_bits - 1;
- wss.num_entries = table_bits / BITS_PER_LONG;
-
- wss.threshold = (llc_bits * wss_threshold) / 100;
- if (wss.threshold == 0)
- wss.threshold = 1;
-
- atomic_set(&wss.clean_counter, wss_clean_period);
-
- wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
- GFP_KERNEL);
- if (!wss.entries) {
- hfi1_wss_exit();
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void hfi1_wss_exit(void)
-{
- /* coded to handle partially initialized and repeat callers */
- kfree(wss.entries);
- wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter. When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking. Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information. Since this is only a heuristic, this is
- * OK. Any innaccuracies will clean themselves out as the counter
- * advances. That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero. When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
- int entry;
- int weight;
- unsigned long bits;
-
- /* become the cleaner if we decrement the counter to zero */
- if (atomic_dec_and_test(&wss.clean_counter)) {
- /*
- * Set, not add, the clean period. This avoids an issue
- * where the counter could decrement below the clean period.
- * Doing a set can result in lost decrements, slowing the
- * clean advance. Since this a heuristic, this possible
- * slowdown is OK.
- *
- * An alternative is to loop, advancing the counter by a
- * clean period until the result is > 0. However, this could
- * lead to several threads keeping another in the clean loop.
- * This could be mitigated by limiting the number of times
- * we stay in the loop.
- */
- atomic_set(&wss.clean_counter, wss_clean_period);
-
- /*
- * Uniquely grab the entry to clean and move to next.
- * The current entry is always the lower bits of
- * wss.clean_entry. The table size, wss.num_entries,
- * is always a power-of-2.
- */
- entry = (atomic_inc_return(&wss.clean_entry) - 1)
- & (wss.num_entries - 1);
-
- /* clear the entry and count the bits */
- bits = xchg(&wss.entries[entry], 0);
- weight = hweight64((u64)bits);
- /* only adjust the contended total count if needed */
- if (weight)
- atomic_sub(weight, &wss.total_count);
- }
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
- u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
- u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
- u32 nr = page & (BITS_PER_LONG - 1);
-
- if (!test_and_set_bit(nr, &wss.entries[entry]))
- atomic_inc(&wss.total_count);
-
- wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
- return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
/*
* Translate ib_wr_opcode into ib_wc_opcode.
*/
@@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = {
*/
__be64 ib_hfi1_sys_image_guid;
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
- struct rvt_sge_state *ss,
- void *data, u32 length,
- bool release,
- bool copy_last)
-{
- struct rvt_sge *sge = &ss->sge;
- int i;
- bool in_last = false;
- bool cacheless_copy = false;
-
- if (sge_copy_mode == COPY_CACHELESS) {
- cacheless_copy = length >= PAGE_SIZE;
- } else if (sge_copy_mode == COPY_ADAPTIVE) {
- if (length >= PAGE_SIZE) {
- /*
- * NOTE: this *assumes*:
- * o The first vaddr is the dest.
- * o If multiple pages, then vaddr is sequential.
- */
- wss_insert(sge->vaddr);
- if (length >= (2 * PAGE_SIZE))
- wss_insert(sge->vaddr + PAGE_SIZE);
-
- cacheless_copy = wss_exceeds_threshold();
- } else {
- wss_advance_clean_counter();
- }
- }
- if (copy_last) {
- if (length > 8) {
- length -= 8;
- } else {
- copy_last = false;
- in_last = true;
- }
- }
-
-again:
- while (length) {
- u32 len = rvt_get_sge_length(sge, length);
-
- WARN_ON_ONCE(len == 0);
- if (unlikely(in_last)) {
- /* enforce byte transfer ordering */
- for (i = 0; i < len; i++)
- ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
- } else if (cacheless_copy) {
- cacheless_memcpy(sge->vaddr, data, len);
- } else {
- memcpy(sge->vaddr, data, len);
- }
- rvt_update_sge(ss, len, release);
- data += len;
- length -= len;
- }
-
- if (copy_last) {
- copy_last = false;
- in_last = true;
- length = 8;
- goto again;
- }
-}
-
/*
* Make sure the QP is ready and able to accept the given opcode.
*/
@@ -713,7 +492,7 @@ static void verbs_sdma_complete(
spin_lock(&qp->s_lock);
if (tx->wqe) {
- hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+ rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
} else if (qp->ibqp.qp_type == IB_QPT_RC) {
struct hfi1_opa_header *hdr;
@@ -737,7 +516,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
- &priv->s_iowait.tx_head);
+ &ps->wait->tx_head);
if (list_empty(&priv->s_iowait.list)) {
if (list_empty(&dev->memwait))
mod_timer(&dev->mem_timer, jiffies + 1);
@@ -748,7 +527,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
rvt_get_qp(qp);
}
write_sequnlock(&dev->iowait_lock);
- qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
}
spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -950,8 +729,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
if (unlikely(ret))
goto bail_build;
}
- ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
- ps->pkts_sent);
+ ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
if (unlikely(ret < 0)) {
if (ret == -ECOMM)
goto bail_ecomm;
@@ -1001,7 +779,7 @@ static int pio_wait(struct rvt_qp *qp,
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
- &priv->s_iowait.tx_head);
+ &ps->wait->tx_head);
if (list_empty(&priv->s_iowait.list)) {
struct hfi1_ibdev *dev = &dd->verbs_dev;
int was_empty;
@@ -1020,7 +798,7 @@ static int pio_wait(struct rvt_qp *qp,
hfi1_sc_wantpiobuf_intr(sc, 1);
}
write_sequnlock(&dev->iowait_lock);
- qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
}
spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1160,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
pio_bail:
if (qp->s_wqe) {
spin_lock_irqsave(&qp->s_lock, flags);
- hfi1_send_complete(qp, qp->s_wqe, wc_status);
+ rvt_send_complete(qp, qp->s_wqe, wc_status);
spin_unlock_irqrestore(&qp->s_lock, flags);
} else if (qp->ibqp.qp_type == IB_QPT_RC) {
spin_lock_irqsave(&qp->s_lock, flags);
@@ -1367,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
hfi1_cdbg(PIO, "%s() Failed. Completing with err",
__func__);
spin_lock_irqsave(&qp->s_lock, flags);
- hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+ rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
spin_unlock_irqrestore(&qp->s_lock, flags);
}
return -EINVAL;
@@ -1582,6 +1360,7 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
struct hfi1_pportdata *ppd;
struct hfi1_devdata *dd;
u8 sc5;
+ u8 sl;
if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) &&
!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
@@ -1590,8 +1369,13 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
/* test the mapping for validity */
ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
ppd = ppd_from_ibp(ibp);
- sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)];
dd = dd_from_ppd(ppd);
+
+ sl = rdma_ah_get_sl(ah_attr);
+ if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
+ return -EINVAL;
+
+ sc5 = ibp->sl_to_sc[sl];
if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
return -EINVAL;
return 0;
@@ -1937,7 +1721,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
- dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+ dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
hfi1_comp_vect_mappings_lookup;
@@ -1950,10 +1734,16 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+ dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+ dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+ dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+ /* opcode translation table */
+ dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
+
ppd = dd->pport;
for (i = 0; i < dd->num_pports; i++, ppd++)
rvt_init_port(&dd->verbs_dev.rdi,
@@ -1961,6 +1751,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
i,
ppd->pkeys);
+ rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
+ &ib_hfi1_attr_group);
+
ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
if (ret)
goto err_verbs_txreq;