aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/infiniband/hw/hfi1
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c59
-rw-r--r--drivers/infiniband/hw/hfi1/exp_rcv.h5
-rw-r--r--drivers/infiniband/hw/hfi1/file_ops.c92
-rw-r--r--drivers/infiniband/hw/hfi1/init.c2
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.c4
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.h15
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.c255
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h3
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c61
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c81
10 files changed, 346 insertions, 231 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index ebe970f76232..90b672feed83 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1056,7 +1056,7 @@ static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
static void handle_temp_err(struct hfi1_devdata *dd);
static void dc_shutdown(struct hfi1_devdata *dd);
static void dc_start(struct hfi1_devdata *dd);
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp,
unsigned int *np);
static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd);
static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms);
@@ -13362,7 +13362,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
int ret;
unsigned ngroups;
int rmt_count;
- int user_rmt_reduced;
u32 n_usr_ctxts;
u32 send_contexts = chip_send_contexts(dd);
u32 rcv_contexts = chip_rcv_contexts(dd);
@@ -13421,28 +13420,34 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
(num_kernel_contexts + n_usr_ctxts),
&node_affinity.real_cpu_mask);
/*
- * The RMT entries are currently allocated as shown below:
- * 1. QOS (0 to 128 entries);
- * 2. FECN (num_kernel_context - 1 + num_user_contexts +
- * num_netdev_contexts);
- * 3. netdev (num_netdev_contexts).
- * It should be noted that FECN oversubscribe num_netdev_contexts
- * entries of RMT because both netdev and PSM could allocate any receive
- * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
- * and PSM FECN must reserve an RMT entry for each possible PSM receive
- * context.
+ * RMT entries are allocated as follows:
+ * 1. QOS (0 to 128 entries)
+ * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts +
+ * num_netdev_contexts [b])
+ * 3. netdev (NUM_NETDEV_MAP_ENTRIES)
+ *
+ * Notes:
+ * [a] Kernel contexts (except control) are included in FECN if kernel
+ * TID_RDMA is active.
+ * [b] Netdev and user contexts are randomly allocated from the same
+ * context pool, so FECN must cover all contexts in the pool.
*/
- rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2);
- if (HFI1_CAP_IS_KSET(TID_RDMA))
- rmt_count += num_kernel_contexts - 1;
- if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
- user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
- dd_dev_err(dd,
- "RMT size is reducing the number of user receive contexts from %u to %d\n",
- n_usr_ctxts,
- user_rmt_reduced);
- /* recalculate */
- n_usr_ctxts = user_rmt_reduced;
+ rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL)
+ + (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1
+ : 0)
+ + n_usr_ctxts
+ + num_netdev_contexts
+ + NUM_NETDEV_MAP_ENTRIES;
+ if (rmt_count > NUM_MAP_ENTRIES) {
+ int over = rmt_count - NUM_MAP_ENTRIES;
+ /* try to squish user contexts, minimum of 1 */
+ if (over >= n_usr_ctxts) {
+ dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n");
+ return -EINVAL;
+ }
+ dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n",
+ n_usr_ctxts, n_usr_ctxts - over);
+ n_usr_ctxts -= over;
}
/* the first N are kernel contexts, the rest are user/netdev contexts */
@@ -14299,15 +14304,15 @@ static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index)
}
/* return the number of RSM map table entries that will be used for QOS */
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp,
unsigned int *np)
{
int i;
unsigned int m, n;
- u8 max_by_vl = 0;
+ uint max_by_vl = 0;
/* is QOS active at all? */
- if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+ if (n_krcv_queues < MIN_KERNEL_KCTXTS ||
num_vls == 1 ||
krcvqsset <= 1)
goto no_qos;
@@ -14365,7 +14370,7 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
if (!rmt)
goto bail;
- rmt_entries = qos_rmt_entries(dd, &m, &n);
+ rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n);
if (rmt_entries == 0)
goto bail;
qpns_per_vl = 1 << m;
diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h
index c6291bbf723c..41f7fe5d1839 100644
--- a/drivers/infiniband/hw/hfi1/exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.h
@@ -133,12 +133,13 @@ static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
return grp;
}
-static inline u32 rcventry2tidinfo(u32 rcventry)
+static inline u32 create_tid(u32 rcventry, u32 npages)
{
u32 pair = rcventry & ~0x1;
return EXP_TID_SET(IDX, pair >> 1) |
- EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+ EXP_TID_SET(CTRL, 1 << (rcventry - pair)) |
+ EXP_TID_SET(LEN, npages);
}
/**
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index f5f9269fdc16..b1d6ca7e9708 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -306,6 +306,17 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
return reqs;
}
+static inline void mmap_cdbg(u16 ctxt, u8 subctxt, u8 type, u8 mapio, u8 vmf,
+ u64 memaddr, void *memvirt, dma_addr_t memdma,
+ ssize_t memlen, struct vm_area_struct *vma)
+{
+ hfi1_cdbg(PROC,
+ "%u:%u type:%u io/vf/dma:%d/%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx",
+ ctxt, subctxt, type, mapio, vmf, !!memdma,
+ memaddr ?: (u64)memvirt, memlen,
+ vma->vm_end - vma->vm_start, vma->vm_flags);
+}
+
static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
{
struct hfi1_filedata *fd = fp->private_data;
@@ -315,6 +326,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
u64 token = vma->vm_pgoff << PAGE_SHIFT,
memaddr = 0;
void *memvirt = NULL;
+ dma_addr_t memdma = 0;
u8 subctxt, mapio = 0, vmf = 0, type;
ssize_t memlen = 0;
int ret = 0;
@@ -334,6 +346,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
goto done;
}
+ /*
+ * vm_pgoff is used as a buffer selector cookie. Always mmap from
+ * the beginning.
+ */
+ vma->vm_pgoff = 0;
flags = vma->vm_flags;
switch (type) {
@@ -355,7 +372,8 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
mapio = 1;
break;
- case PIO_CRED:
+ case PIO_CRED: {
+ u64 cr_page_offset;
if (flags & VM_WRITE) {
ret = -EPERM;
goto done;
@@ -365,10 +383,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
* second or third page allocated for credit returns (if number
* of enabled contexts > 64 and 128 respectively).
*/
- memvirt = dd->cr_base[uctxt->numa_id].va;
- memaddr = virt_to_phys(memvirt) +
- (((u64)uctxt->sc->hw_free -
- (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+ cr_page_offset = ((u64)uctxt->sc->hw_free -
+ (u64)dd->cr_base[uctxt->numa_id].va) &
+ PAGE_MASK;
+ memvirt = dd->cr_base[uctxt->numa_id].va + cr_page_offset;
+ memdma = dd->cr_base[uctxt->numa_id].dma + cr_page_offset;
memlen = PAGE_SIZE;
flags &= ~VM_MAYWRITE;
flags |= VM_DONTCOPY | VM_DONTEXPAND;
@@ -378,14 +397,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
* memory been flagged as non-cached?
*/
/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
- mapio = 1;
break;
+ }
case RCV_HDRQ:
memlen = rcvhdrq_size(uctxt);
memvirt = uctxt->rcvhdrq;
+ memdma = uctxt->rcvhdrq_dma;
break;
case RCV_EGRBUF: {
- unsigned long addr;
+ unsigned long vm_start_save;
+ unsigned long vm_end_save;
int i;
/*
* The RcvEgr buffer need to be handled differently
@@ -403,25 +424,35 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
ret = -EPERM;
goto done;
}
- vma->vm_flags &= ~VM_MAYWRITE;
- addr = vma->vm_start;
+ vm_flags_clear(vma, VM_MAYWRITE);
+ /*
+ * Mmap multiple separate allocations into a single vma. From
+ * here, dma_mmap_coherent() calls dma_direct_mmap(), which
+ * requires the mmap to exactly fill the vma starting at
+ * vma_start. Adjust the vma start and end for each eager
+ * buffer segment mapped. Restore the originals when done.
+ */
+ vm_start_save = vma->vm_start;
+ vm_end_save = vma->vm_end;
+ vma->vm_end = vma->vm_start;
for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
memlen = uctxt->egrbufs.buffers[i].len;
memvirt = uctxt->egrbufs.buffers[i].addr;
- ret = remap_pfn_range(
- vma, addr,
- /*
- * virt_to_pfn() does the same, but
- * it's not available on x86_64
- * when CONFIG_MMU is enabled.
- */
- PFN_DOWN(__pa(memvirt)),
- memlen,
- vma->vm_page_prot);
- if (ret < 0)
+ memdma = uctxt->egrbufs.buffers[i].dma;
+ vma->vm_end += memlen;
+ mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr,
+ memvirt, memdma, memlen, vma);
+ ret = dma_mmap_coherent(&dd->pcidev->dev, vma,
+ memvirt, memdma, memlen);
+ if (ret < 0) {
+ vma->vm_start = vm_start_save;
+ vma->vm_end = vm_end_save;
goto done;
- addr += memlen;
+ }
+ vma->vm_start += memlen;
}
+ vma->vm_start = vm_start_save;
+ vma->vm_end = vm_end_save;
ret = 0;
goto done;
}
@@ -481,6 +512,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
}
memlen = PAGE_SIZE;
memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
+ memdma = uctxt->rcvhdrqtailaddr_dma;
flags &= ~VM_MAYWRITE;
break;
case SUBCTXT_UREGS:
@@ -528,15 +560,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
goto done;
}
- vma->vm_flags = flags;
- hfi1_cdbg(PROC,
- "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
- ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
- vma->vm_end - vma->vm_start, vma->vm_flags);
+ vm_flags_reset(vma, flags);
+ mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma,
+ memlen, vma);
if (vmf) {
vma->vm_pgoff = PFN_DOWN(memaddr);
vma->vm_ops = &vm_ops;
ret = 0;
+ } else if (memdma) {
+ ret = dma_mmap_coherent(&dd->pcidev->dev, vma,
+ memvirt, memdma, memlen);
} else if (mapio) {
ret = io_remap_pfn_range(vma, vma->vm_start,
PFN_DOWN(memaddr),
@@ -1318,12 +1351,15 @@ static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg,
addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
sizeof(tinfo.tidcnt)))
- return -EFAULT;
+ ret = -EFAULT;
addr = arg + offsetof(struct hfi1_tid_info, length);
- if (copy_to_user((void __user *)addr, &tinfo.length,
+ if (!ret && copy_to_user((void __user *)addr, &tinfo.length,
sizeof(tinfo.length)))
ret = -EFAULT;
+
+ if (ret)
+ hfi1_user_exp_rcv_invalid(fd, &tinfo);
}
return ret;
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 24c0f0d257fc..62b6c5020039 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -464,7 +464,7 @@ bail:
*
* This wrapper is the free function that matches hfi1_create_ctxtdata().
* When a context is done being used (kernel or user), this function is called
- * for the "final" put to match the kref init from hf1i_create_ctxtdata().
+ * for the "final" put to match the kref init from hfi1_create_ctxtdata().
* Other users of the context do a get/put sequence to make sure that the
* structure isn't removed while in use.
*/
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index a95b654f5254..8ed20392e9f0 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -3160,8 +3160,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
int rval = 0;
- tx->num_desc++;
- if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) {
rval = _extend_sdma_tx_descs(dd, tx);
if (rval) {
__sdma_txclean(dd, tx);
@@ -3174,6 +3173,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
SDMA_MAP_NONE,
dd->sdma_pad_phys,
sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+ tx->num_desc++;
_sdma_close_tx(dd, tx);
return rval;
}
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index d8170fcbfbdd..b023fc461bd5 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -631,14 +631,13 @@ static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx)
static inline void _sdma_close_tx(struct hfi1_devdata *dd,
struct sdma_txreq *tx)
{
- tx->descp[tx->num_desc].qw[0] |=
- SDMA_DESC0_LAST_DESC_FLAG;
- tx->descp[tx->num_desc].qw[1] |=
- dd->default_desc1;
+ u16 last_desc = tx->num_desc - 1;
+
+ tx->descp[last_desc].qw[0] |= SDMA_DESC0_LAST_DESC_FLAG;
+ tx->descp[last_desc].qw[1] |= dd->default_desc1;
if (tx->flags & SDMA_TXREQ_F_URGENT)
- tx->descp[tx->num_desc].qw[1] |=
- (SDMA_DESC1_HEAD_TO_HOST_FLAG |
- SDMA_DESC1_INT_REQ_FLAG);
+ tx->descp[last_desc].qw[1] |= (SDMA_DESC1_HEAD_TO_HOST_FLAG |
+ SDMA_DESC1_INT_REQ_FLAG);
}
static inline int _sdma_txadd_daddr(
@@ -655,6 +654,7 @@ static inline int _sdma_txadd_daddr(
type,
addr, len);
WARN_ON(len > tx->tlen);
+ tx->num_desc++;
tx->tlen -= len;
/* special cases for last */
if (!tx->tlen) {
@@ -666,7 +666,6 @@ static inline int _sdma_txadd_daddr(
_sdma_close_tx(dd, tx);
}
}
- tx->num_desc++;
return rval;
}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 186d30291260..96058baf36ed 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -23,18 +23,24 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
- struct tid_group *grp,
- unsigned int start, u16 count,
+ struct tid_group *grp, u16 count,
u32 *tidlist, unsigned int *tididx,
unsigned int *pmapped);
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
- struct tid_group **grp);
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
+static void __clear_tid_node(struct hfi1_filedata *fd,
+ struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
static const struct mmu_interval_notifier_ops tid_mn_ops = {
.invalidate = tid_rb_invalidate,
};
+static const struct mmu_interval_notifier_ops tid_cover_ops = {
+ .invalidate = tid_cover_invalidate,
+};
/*
* Initialize context and file private data needed for Expected
@@ -153,16 +159,11 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd,
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
int pinned;
- unsigned int npages;
+ unsigned int npages = tidbuf->npages;
unsigned long vaddr = tidbuf->vaddr;
struct page **pages = NULL;
struct hfi1_devdata *dd = fd->uctxt->dd;
- /* Get the number of pages the user buffer spans */
- npages = num_user_pages(vaddr, tidbuf->length);
- if (!npages)
- return -EINVAL;
-
if (npages > fd->uctxt->expected_count) {
dd_dev_err(dd, "Expected buffer too big\n");
return -EINVAL;
@@ -189,7 +190,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
return pinned;
}
tidbuf->pages = pages;
- tidbuf->npages = npages;
fd->tid_n_pinned += pinned;
return pinned;
}
@@ -249,57 +249,70 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
int ret = 0, need_group = 0, pinned;
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
- unsigned int ngroups, pageidx = 0, pageset_count,
+ unsigned int ngroups, pageset_count,
tididx = 0, mapped, mapped_pages = 0;
u32 *tidlist = NULL;
struct tid_user_buf *tidbuf;
+ unsigned long mmu_seq = 0;
if (!PAGE_ALIGNED(tinfo->vaddr))
return -EINVAL;
+ if (tinfo->length == 0)
+ return -EINVAL;
tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
if (!tidbuf)
return -ENOMEM;
+ mutex_init(&tidbuf->cover_mutex);
tidbuf->vaddr = tinfo->vaddr;
tidbuf->length = tinfo->length;
+ tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
GFP_KERNEL);
if (!tidbuf->psets) {
- kfree(tidbuf);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail_release_mem;
+ }
+
+ if (fd->use_mn) {
+ ret = mmu_interval_notifier_insert(
+ &tidbuf->notifier, current->mm,
+ tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
+ &tid_cover_ops);
+ if (ret)
+ goto fail_release_mem;
+ mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
}
pinned = pin_rcv_pages(fd, tidbuf);
if (pinned <= 0) {
- kfree(tidbuf->psets);
- kfree(tidbuf);
- return pinned;
+ ret = (pinned < 0) ? pinned : -ENOSPC;
+ goto fail_unpin;
}
/* Find sets of physically contiguous pages */
tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
- /*
- * We don't need to access this under a lock since tid_used is per
- * process and the same process cannot be in hfi1_user_exp_rcv_clear()
- * and hfi1_user_exp_rcv_setup() at the same time.
- */
+ /* Reserve the number of expected tids to be used. */
spin_lock(&fd->tid_lock);
if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
pageset_count = fd->tid_limit - fd->tid_used;
else
pageset_count = tidbuf->n_psets;
+ fd->tid_used += pageset_count;
spin_unlock(&fd->tid_lock);
- if (!pageset_count)
- goto bail;
+ if (!pageset_count) {
+ ret = -ENOSPC;
+ goto fail_unreserve;
+ }
ngroups = pageset_count / dd->rcv_entries.group_size;
tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
if (!tidlist) {
ret = -ENOMEM;
- goto nomem;
+ goto fail_unreserve;
}
tididx = 0;
@@ -318,7 +331,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
tid_group_pop(&uctxt->tid_group_list);
ret = program_rcvarray(fd, tidbuf, grp,
- pageidx, dd->rcv_entries.group_size,
+ dd->rcv_entries.group_size,
tidlist, &tididx, &mapped);
/*
* If there was a failure to program the RcvArray
@@ -334,11 +347,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
tid_group_add_tail(grp, &uctxt->tid_full_list);
ngroups--;
- pageidx += ret;
mapped_pages += mapped;
}
- while (pageidx < pageset_count) {
+ while (tididx < pageset_count) {
struct tid_group *grp, *ptr;
/*
* If we don't have any partially used tid groups, check
@@ -360,11 +372,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
*/
list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
list) {
- unsigned use = min_t(unsigned, pageset_count - pageidx,
+ unsigned use = min_t(unsigned, pageset_count - tididx,
grp->size - grp->used);
ret = program_rcvarray(fd, tidbuf, grp,
- pageidx, use, tidlist,
+ use, tidlist,
&tididx, &mapped);
if (ret < 0) {
hfi1_cdbg(TID,
@@ -376,11 +388,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
tid_group_move(grp,
&uctxt->tid_used_list,
&uctxt->tid_full_list);
- pageidx += ret;
mapped_pages += mapped;
need_group = 0;
/* Check if we are done so we break out early */
- if (pageidx >= pageset_count)
+ if (tididx >= pageset_count)
break;
} else if (WARN_ON(ret == 0)) {
/*
@@ -395,43 +406,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
}
unlock:
mutex_unlock(&uctxt->exp_mutex);
-nomem:
hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
mapped_pages, ret);
- if (tididx) {
- spin_lock(&fd->tid_lock);
- fd->tid_used += tididx;
- spin_unlock(&fd->tid_lock);
- tinfo->tidcnt = tididx;
- tinfo->length = mapped_pages * PAGE_SIZE;
-
- if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
- tidlist, sizeof(tidlist[0]) * tididx)) {
- /*
- * On failure to copy to the user level, we need to undo
- * everything done so far so we don't leak resources.
- */
- tinfo->tidlist = (unsigned long)&tidlist;
- hfi1_user_exp_rcv_clear(fd, tinfo);
- tinfo->tidlist = 0;
- ret = -EFAULT;
- goto bail;
+
+ /* fail if nothing was programmed, set error if none provided */
+ if (tididx == 0) {
+ if (ret >= 0)
+ ret = -ENOSPC;
+ goto fail_unreserve;
+ }
+
+ /* adjust reserved tid_used to actual count */
+ spin_lock(&fd->tid_lock);
+ fd->tid_used -= pageset_count - tididx;
+ spin_unlock(&fd->tid_lock);
+
+ /* unpin all pages not covered by a TID */
+ unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
+ false);
+
+ if (fd->use_mn) {
+ /* check for an invalidate during setup */
+ bool fail = false;
+
+ mutex_lock(&tidbuf->cover_mutex);
+ fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
+ mutex_unlock(&tidbuf->cover_mutex);
+
+ if (fail) {
+ ret = -EBUSY;
+ goto fail_unprogram;
}
}
- /*
- * If not everything was mapped (due to insufficient RcvArray entries,
- * for example), unpin all unmapped pages so we can pin them nex time.
- */
- if (mapped_pages != pinned)
- unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
- (pinned - mapped_pages), false);
-bail:
+ tinfo->tidcnt = tididx;
+ tinfo->length = mapped_pages * PAGE_SIZE;
+
+ if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
+ tidlist, sizeof(tidlist[0]) * tididx)) {
+ ret = -EFAULT;
+ goto fail_unprogram;
+ }
+
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(&tidbuf->notifier);
+ kfree(tidbuf->pages);
kfree(tidbuf->psets);
+ kfree(tidbuf);
kfree(tidlist);
+ return 0;
+
+fail_unprogram:
+ /* unprogram, unmap, and unpin all allocated TIDs */
+ tinfo->tidlist = (unsigned long)tidlist;
+ hfi1_user_exp_rcv_clear(fd, tinfo);
+ tinfo->tidlist = 0;
+ pinned = 0; /* nothing left to unpin */
+ pageset_count = 0; /* nothing left reserved */
+fail_unreserve:
+ spin_lock(&fd->tid_lock);
+ fd->tid_used -= pageset_count;
+ spin_unlock(&fd->tid_lock);
+fail_unpin:
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(&tidbuf->notifier);
+ if (pinned > 0)
+ unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
+fail_release_mem:
kfree(tidbuf->pages);
+ kfree(tidbuf->psets);
kfree(tidbuf);
- return ret > 0 ? 0 : ret;
+ kfree(tidlist);
+ return ret;
}
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@@ -452,7 +498,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
mutex_lock(&uctxt->exp_mutex);
for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
- ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
+ ret = unprogram_rcvarray(fd, tidinfo[tididx]);
if (ret) {
hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
ret);
@@ -589,7 +635,6 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
* struct tid_pageset holding information on physically contiguous
* chunks from the user buffer), and other fields.
* @grp: RcvArray group
- * @start: starting index into sets array
* @count: number of struct tid_pageset's to program
* @tidlist: the array of u32 elements when the information about the
* programmed RcvArray entries is to be encoded.
@@ -609,14 +654,14 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
* number of RcvArray entries programmed.
*/
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
- struct tid_group *grp,
- unsigned int start, u16 count,
+ struct tid_group *grp, u16 count,
u32 *tidlist, unsigned int *tididx,
unsigned int *pmapped)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
u16 idx;
+ unsigned int start = *tididx;
u32 tidinfo = 0, rcventry, useidx = 0;
int mapped = 0;
@@ -661,8 +706,7 @@ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
return ret;
mapped += npages;
- tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
- EXP_TID_SET(LEN, npages);
+ tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
tidlist[(*tididx)++] = tidinfo;
grp->used++;
grp->map |= 1 << useidx++;
@@ -706,6 +750,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
}
node->fdata = fd;
+ mutex_init(&node->invalidate_mutex);
node->phys = page_to_phys(pages[0]);
node->npages = npages;
node->rcventry = rcventry;
@@ -721,11 +766,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
&tid_mn_ops);
if (ret)
goto out_unmap;
- /*
- * FIXME: This is in the wrong order, the notifier should be
- * established before the pages are pinned by pin_rcv_pages.
- */
- mmu_interval_read_begin(&node->notifier);
}
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
@@ -745,33 +785,29 @@ out_unmap:
return -EFAULT;
}
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
- struct tid_group **grp)
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
struct tid_rb_node *node;
- u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+ u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
- if (tididx >= uctxt->expected_count) {
- dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
- tididx, uctxt->ctxt);
- return -EINVAL;
- }
-
- if (tidctrl == 0x3)
+ if (tidctrl == 0x3 || tidctrl == 0x0)
return -EINVAL;
rcventry = tididx + (tidctrl - 1);
+ if (rcventry >= uctxt->expected_count) {
+ dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+ rcventry, uctxt->ctxt);
+ return -EINVAL;
+ }
+
node = fd->entry_to_rb[rcventry];
if (!node || node->rcventry != (uctxt->expected_base + rcventry))
return -EBADF;
- if (grp)
- *grp = node->grp;
-
if (fd->use_mn)
mmu_interval_notifier_remove(&node->notifier);
cacheless_tid_rb_remove(fd, node);
@@ -779,23 +815,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
return 0;
}
-static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
+ mutex_lock(&node->invalidate_mutex);
+ if (node->freed)
+ goto done;
+ node->freed = true;
+
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
node->npages,
node->notifier.interval_tree.start, node->phys,
node->dma_addr);
- /*
- * Make sure device has seen the write before we unpin the
- * pages.
- */
+ /* Make sure device has seen the write before pages are unpinned */
hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
+done:
+ mutex_unlock(&node->invalidate_mutex);
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+ __clear_tid_node(fd, node);
node->grp->used--;
node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@@ -854,17 +901,22 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
if (node->freed)
return true;
+ /* take action only if unmapping */
+ if (range->event != MMU_NOTIFY_UNMAP)
+ return true;
+
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
node->notifier.interval_tree.start,
node->rcventry, node->npages, node->dma_addr);
- node->freed = true;
+
+ /* clear the hardware rcvarray entry */
+ __clear_tid_node(fdata, node);
spin_lock(&fdata->invalid_lock);
if (fdata->invalid_tid_idx < uctxt->expected_count) {
fdata->invalid_tids[fdata->invalid_tid_idx] =
- rcventry2tidinfo(node->rcventry - uctxt->expected_base);
- fdata->invalid_tids[fdata->invalid_tid_idx] |=
- EXP_TID_SET(LEN, node->npages);
+ create_tid(node->rcventry - uctxt->expected_base,
+ node->npages);
if (!fdata->invalid_tid_idx) {
unsigned long *ev;
@@ -887,6 +939,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
return true;
}
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct tid_user_buf *tidbuf =
+ container_of(mni, struct tid_user_buf, notifier);
+
+ /* take action only if unmapping */
+ if (range->event == MMU_NOTIFY_UNMAP) {
+ mutex_lock(&tidbuf->cover_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
+ mutex_unlock(&tidbuf->cover_mutex);
+ }
+
+ return true;
+}
+
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
struct tid_rb_node *tnode)
{
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 8c53e416bf84..f8ee997d0050 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -16,6 +16,8 @@ struct tid_pageset {
};
struct tid_user_buf {
+ struct mmu_interval_notifier notifier;
+ struct mutex cover_mutex;
unsigned long vaddr;
unsigned long length;
unsigned int npages;
@@ -27,6 +29,7 @@ struct tid_user_buf {
struct tid_rb_node {
struct mmu_interval_notifier notifier;
struct hfi1_filedata *fdata;
+ struct mutex invalidate_mutex; /* covers hw removal */
unsigned long phys;
struct tid_group *grp;
u32 rcventry;
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 7bce963e2ae6..36aaedc65145 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -29,33 +29,52 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
u32 nlocked, u32 npages)
{
- unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
- size = (cache_size * (1UL << 20)); /* convert to bytes */
- unsigned int usr_ctxts =
- dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
- bool can_lock = capable(CAP_IPC_LOCK);
+ unsigned long ulimit_pages;
+ unsigned long cache_limit_pages;
+ unsigned int usr_ctxts;
/*
- * Calculate per-cache size. The calculation below uses only a quarter
- * of the available per-context limit. This leaves space for other
- * pinning. Should we worry about shared ctxts?
+ * Perform RLIMIT_MEMLOCK based checks unless CAP_IPC_LOCK is present.
*/
- cache_limit = (ulimit / usr_ctxts) / 4;
-
- /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
- if (ulimit != (-1UL) && size > cache_limit)
- size = cache_limit;
-
- /* Convert to number of pages */
- size = DIV_ROUND_UP(size, PAGE_SIZE);
-
- pinned = atomic64_read(&mm->pinned_vm);
+ if (!capable(CAP_IPC_LOCK)) {
+ ulimit_pages =
+ DIV_ROUND_DOWN_ULL(rlimit(RLIMIT_MEMLOCK), PAGE_SIZE);
+
+ /*
+ * Pinning these pages would exceed this process's locked memory
+ * limit.
+ */
+ if (atomic64_read(&mm->pinned_vm) + npages > ulimit_pages)
+ return false;
+
+ /*
+ * Only allow 1/4 of the user's RLIMIT_MEMLOCK to be used for HFI
+ * caches. This fraction is then equally distributed among all
+ * existing user contexts. Note that if RLIMIT_MEMLOCK is
+ * 'unlimited' (-1), the value of this limit will be > 2^42 pages
+ * (2^64 / 2^12 / 2^8 / 2^2).
+ *
+ * The effectiveness of this check may be reduced if I/O occurs on
+ * some user contexts before all user contexts are created. This
+ * check assumes that this process is the only one using this
+ * context (e.g., the corresponding fd was not passed to another
+ * process for concurrent access) as there is no per-context,
+ * per-process tracking of pinned pages. It also assumes that each
+ * user context has only one cache to limit.
+ */
+ usr_ctxts = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
+ if (nlocked + npages > (ulimit_pages / usr_ctxts / 4))
+ return false;
+ }
- /* First, check the absolute limit against all pinned pages. */
- if (pinned + npages >= ulimit && !can_lock)
+ /*
+ * Pinning these pages would exceed the size limit for this cache.
+ */
+ cache_limit_pages = cache_size * (1024 * 1024) / PAGE_SIZE;
+ if (nlocked + npages > cache_limit_pages)
return false;
- return ((nlocked + npages) <= size) || can_lock;
+ return true;
}
int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index e6e17984553c..7f6d7fc7951d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1598,13 +1598,11 @@ static const char * const driver_cntr_names[] = {
"DRIVER_EgrHdrFull"
};
-static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */
static struct rdma_stat_desc *dev_cntr_descs;
static struct rdma_stat_desc *port_cntr_descs;
int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
static int num_dev_cntrs;
static int num_port_cntrs;
-static int cntr_names_initialized;
/*
* Convert a list of names separated by '\n' into an array of NULL terminated
@@ -1615,8 +1613,8 @@ static int init_cntr_names(const char *names_in, const size_t names_len,
int num_extra_names, int *num_cntrs,
struct rdma_stat_desc **cntr_descs)
{
- struct rdma_stat_desc *q;
- char *names_out, *p;
+ struct rdma_stat_desc *names_out;
+ char *p;
int i, n;
n = 0;
@@ -1624,65 +1622,45 @@ static int init_cntr_names(const char *names_in, const size_t names_len,
if (names_in[i] == '\n')
n++;
- names_out =
- kzalloc((n + num_extra_names) * sizeof(*q) + names_len,
- GFP_KERNEL);
+ names_out = kzalloc((n + num_extra_names) * sizeof(*names_out)
+ + names_len,
+ GFP_KERNEL);
if (!names_out) {
*num_cntrs = 0;
*cntr_descs = NULL;
return -ENOMEM;
}
- p = names_out + (n + num_extra_names) * sizeof(*q);
+ p = (char *)&names_out[n + num_extra_names];
memcpy(p, names_in, names_len);
- q = (struct rdma_stat_desc *)names_out;
for (i = 0; i < n; i++) {
- q[i].name = p;
+ names_out[i].name = p;
p = strchr(p, '\n');
*p++ = '\0';
}
*num_cntrs = n;
- *cntr_descs = (struct rdma_stat_desc *)names_out;
+ *cntr_descs = names_out;
return 0;
}
-static int init_counters(struct ib_device *ibdev)
-{
- struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
- int i, err = 0;
-
- mutex_lock(&cntr_names_lock);
- if (cntr_names_initialized)
- goto out_unlock;
-
- err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, num_driver_cntrs,
- &num_dev_cntrs, &dev_cntr_descs);
- if (err)
- goto out_unlock;
-
- for (i = 0; i < num_driver_cntrs; i++)
- dev_cntr_descs[num_dev_cntrs + i].name = driver_cntr_names[i];
-
- err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, 0,
- &num_port_cntrs, &port_cntr_descs);
- if (err) {
- kfree(dev_cntr_descs);
- dev_cntr_descs = NULL;
- goto out_unlock;
- }
- cntr_names_initialized = 1;
-
-out_unlock:
- mutex_unlock(&cntr_names_lock);
- return err;
-}
-
static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev)
{
- if (init_counters(ibdev))
- return NULL;
+ if (!dev_cntr_descs) {
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int i, err;
+
+ err = init_cntr_names(dd->cntrnames, dd->cntrnameslen,
+ num_driver_cntrs,
+ &num_dev_cntrs, &dev_cntr_descs);
+ if (err)
+ return NULL;
+
+ for (i = 0; i < num_driver_cntrs; i++)
+ dev_cntr_descs[num_dev_cntrs + i].name =
+ driver_cntr_names[i];
+ }
return rdma_alloc_hw_stats_struct(dev_cntr_descs,
num_dev_cntrs + num_driver_cntrs,
RDMA_HW_STATS_DEFAULT_LIFESPAN);
@@ -1691,8 +1669,16 @@ static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev)
static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev,
u32 port_num)
{
- if (init_counters(ibdev))
- return NULL;
+ if (!port_cntr_descs) {
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int err;
+
+ err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen,
+ 0,
+ &num_port_cntrs, &port_cntr_descs);
+ if (err)
+ return NULL;
+ }
return rdma_alloc_hw_stats_struct(port_cntr_descs, num_port_cntrs,
RDMA_HW_STATS_DEFAULT_LIFESPAN);
}
@@ -1917,13 +1903,10 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
del_timer_sync(&dev->mem_timer);
verbs_txreq_exit(dev);
- mutex_lock(&cntr_names_lock);
kfree(dev_cntr_descs);
kfree(port_cntr_descs);
dev_cntr_descs = NULL;
port_cntr_descs = NULL;
- cntr_names_initialized = 0;
- mutex_unlock(&cntr_names_lock);
}
void hfi1_cnp_rcv(struct hfi1_packet *packet)