diff options
Diffstat (limited to '')
-rw-r--r-- | drivers/infiniband/hw/hfi1/chip.c | 59 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/exp_rcv.h | 5 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/file_ops.c | 92 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/init.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/sdma.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/sdma.h | 15 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/user_exp_rcv.c | 255 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/user_exp_rcv.h | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/user_pages.c | 61 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/verbs.c | 81 |
10 files changed, 346 insertions, 231 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ebe970f76232..90b672feed83 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1056,7 +1056,7 @@ static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *dd); static void dc_shutdown(struct hfi1_devdata *dd); static void dc_start(struct hfi1_devdata *dd); -static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, unsigned int *np); static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms); @@ -13362,7 +13362,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int ret; unsigned ngroups; int rmt_count; - int user_rmt_reduced; u32 n_usr_ctxts; u32 send_contexts = chip_send_contexts(dd); u32 rcv_contexts = chip_rcv_contexts(dd); @@ -13421,28 +13420,34 @@ static int set_up_context_variables(struct hfi1_devdata *dd) (num_kernel_contexts + n_usr_ctxts), &node_affinity.real_cpu_mask); /* - * The RMT entries are currently allocated as shown below: - * 1. QOS (0 to 128 entries); - * 2. FECN (num_kernel_context - 1 + num_user_contexts + - * num_netdev_contexts); - * 3. netdev (num_netdev_contexts). - * It should be noted that FECN oversubscribe num_netdev_contexts - * entries of RMT because both netdev and PSM could allocate any receive - * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts, - * and PSM FECN must reserve an RMT entry for each possible PSM receive - * context. + * RMT entries are allocated as follows: + * 1. QOS (0 to 128 entries) + * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts + + * num_netdev_contexts [b]) + * 3. netdev (NUM_NETDEV_MAP_ENTRIES) + * + * Notes: + * [a] Kernel contexts (except control) are included in FECN if kernel + * TID_RDMA is active. + * [b] Netdev and user contexts are randomly allocated from the same + * context pool, so FECN must cover all contexts in the pool. */ - rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2); - if (HFI1_CAP_IS_KSET(TID_RDMA)) - rmt_count += num_kernel_contexts - 1; - if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { - user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count; - dd_dev_err(dd, - "RMT size is reducing the number of user receive contexts from %u to %d\n", - n_usr_ctxts, - user_rmt_reduced); - /* recalculate */ - n_usr_ctxts = user_rmt_reduced; + rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL) + + (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1 + : 0) + + n_usr_ctxts + + num_netdev_contexts + + NUM_NETDEV_MAP_ENTRIES; + if (rmt_count > NUM_MAP_ENTRIES) { + int over = rmt_count - NUM_MAP_ENTRIES; + /* try to squish user contexts, minimum of 1 */ + if (over >= n_usr_ctxts) { + dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n"); + return -EINVAL; + } + dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n", + n_usr_ctxts, n_usr_ctxts - over); + n_usr_ctxts -= over; } /* the first N are kernel contexts, the rest are user/netdev contexts */ @@ -14299,15 +14304,15 @@ static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) } /* return the number of RSM map table entries that will be used for QOS */ -static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, unsigned int *np) { int i; unsigned int m, n; - u8 max_by_vl = 0; + uint max_by_vl = 0; /* is QOS active at all? */ - if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + if (n_krcv_queues < MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) goto no_qos; @@ -14365,7 +14370,7 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) if (!rmt) goto bail; - rmt_entries = qos_rmt_entries(dd, &m, &n); + rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n); if (rmt_entries == 0) goto bail; qpns_per_vl = 1 << m; diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h index c6291bbf723c..41f7fe5d1839 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -133,12 +133,13 @@ static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) return grp; } -static inline u32 rcventry2tidinfo(u32 rcventry) +static inline u32 create_tid(u32 rcventry, u32 npages) { u32 pair = rcventry & ~0x1; return EXP_TID_SET(IDX, pair >> 1) | - EXP_TID_SET(CTRL, 1 << (rcventry - pair)); + EXP_TID_SET(CTRL, 1 << (rcventry - pair)) | + EXP_TID_SET(LEN, npages); } /** diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..b1d6ca7e9708 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -306,6 +306,17 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) return reqs; } +static inline void mmap_cdbg(u16 ctxt, u8 subctxt, u8 type, u8 mapio, u8 vmf, + u64 memaddr, void *memvirt, dma_addr_t memdma, + ssize_t memlen, struct vm_area_struct *vma) +{ + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf/dma:%d/%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx", + ctxt, subctxt, type, mapio, vmf, !!memdma, + memaddr ?: (u64)memvirt, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); +} + static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) { struct hfi1_filedata *fd = fp->private_data; @@ -315,6 +326,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) u64 token = vma->vm_pgoff << PAGE_SHIFT, memaddr = 0; void *memvirt = NULL; + dma_addr_t memdma = 0; u8 subctxt, mapio = 0, vmf = 0, type; ssize_t memlen = 0; int ret = 0; @@ -334,6 +346,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } + /* + * vm_pgoff is used as a buffer selector cookie. Always mmap from + * the beginning. + */ + vma->vm_pgoff = 0; flags = vma->vm_flags; switch (type) { @@ -355,7 +372,8 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); mapio = 1; break; - case PIO_CRED: + case PIO_CRED: { + u64 cr_page_offset; if (flags & VM_WRITE) { ret = -EPERM; goto done; @@ -365,10 +383,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) * second or third page allocated for credit returns (if number * of enabled contexts > 64 and 128 respectively). */ - memvirt = dd->cr_base[uctxt->numa_id].va; - memaddr = virt_to_phys(memvirt) + - (((u64)uctxt->sc->hw_free - - (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + cr_page_offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & + PAGE_MASK; + memvirt = dd->cr_base[uctxt->numa_id].va + cr_page_offset; + memdma = dd->cr_base[uctxt->numa_id].dma + cr_page_offset; memlen = PAGE_SIZE; flags &= ~VM_MAYWRITE; flags |= VM_DONTCOPY | VM_DONTEXPAND; @@ -378,14 +397,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) * memory been flagged as non-cached? */ /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ - mapio = 1; break; + } case RCV_HDRQ: memlen = rcvhdrq_size(uctxt); memvirt = uctxt->rcvhdrq; + memdma = uctxt->rcvhdrq_dma; break; case RCV_EGRBUF: { - unsigned long addr; + unsigned long vm_start_save; + unsigned long vm_end_save; int i; /* * The RcvEgr buffer need to be handled differently @@ -403,25 +424,35 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; - addr = vma->vm_start; + vm_flags_clear(vma, VM_MAYWRITE); + /* + * Mmap multiple separate allocations into a single vma. From + * here, dma_mmap_coherent() calls dma_direct_mmap(), which + * requires the mmap to exactly fill the vma starting at + * vma_start. Adjust the vma start and end for each eager + * buffer segment mapped. Restore the originals when done. + */ + vm_start_save = vma->vm_start; + vm_end_save = vma->vm_end; + vma->vm_end = vma->vm_start; for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { memlen = uctxt->egrbufs.buffers[i].len; memvirt = uctxt->egrbufs.buffers[i].addr; - ret = remap_pfn_range( - vma, addr, - /* - * virt_to_pfn() does the same, but - * it's not available on x86_64 - * when CONFIG_MMU is enabled. - */ - PFN_DOWN(__pa(memvirt)), - memlen, - vma->vm_page_prot); - if (ret < 0) + memdma = uctxt->egrbufs.buffers[i].dma; + vma->vm_end += memlen; + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, + memvirt, memdma, memlen, vma); + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); + if (ret < 0) { + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; goto done; - addr += memlen; + } + vma->vm_start += memlen; } + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; ret = 0; goto done; } @@ -481,6 +512,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) } memlen = PAGE_SIZE; memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt); + memdma = uctxt->rcvhdrqtailaddr_dma; flags &= ~VM_MAYWRITE; break; case SUBCTXT_UREGS: @@ -528,15 +560,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; - hfi1_cdbg(PROC, - "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", - ctxt, subctxt, type, mapio, vmf, memaddr, memlen, - vma->vm_end - vma->vm_start, vma->vm_flags); + vm_flags_reset(vma, flags); + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma, + memlen, vma); if (vmf) { vma->vm_pgoff = PFN_DOWN(memaddr); vma->vm_ops = &vm_ops; ret = 0; + } else if (memdma) { + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); } else if (mapio) { ret = io_remap_pfn_range(vma, vma->vm_start, PFN_DOWN(memaddr), @@ -1318,12 +1351,15 @@ static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) - return -EFAULT; + ret = -EFAULT; addr = arg + offsetof(struct hfi1_tid_info, length); - if (copy_to_user((void __user *)addr, &tinfo.length, + if (!ret && copy_to_user((void __user *)addr, &tinfo.length, sizeof(tinfo.length))) ret = -EFAULT; + + if (ret) + hfi1_user_exp_rcv_invalid(fd, &tinfo); } return ret; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 24c0f0d257fc..62b6c5020039 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -464,7 +464,7 @@ bail: * * This wrapper is the free function that matches hfi1_create_ctxtdata(). * When a context is done being used (kernel or user), this function is called - * for the "final" put to match the kref init from hf1i_create_ctxtdata(). + * for the "final" put to match the kref init from hfi1_create_ctxtdata(). * Other users of the context do a get/put sequence to make sure that the * structure isn't removed while in use. */ diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index a95b654f5254..8ed20392e9f0 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -3160,8 +3160,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) { int rval = 0; - tx->num_desc++; - if ((unlikely(tx->num_desc == tx->desc_limit))) { + if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) { rval = _extend_sdma_tx_descs(dd, tx); if (rval) { __sdma_txclean(dd, tx); @@ -3174,6 +3173,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) SDMA_MAP_NONE, dd->sdma_pad_phys, sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + tx->num_desc++; _sdma_close_tx(dd, tx); return rval; } diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d8170fcbfbdd..b023fc461bd5 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -631,14 +631,13 @@ static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) static inline void _sdma_close_tx(struct hfi1_devdata *dd, struct sdma_txreq *tx) { - tx->descp[tx->num_desc].qw[0] |= - SDMA_DESC0_LAST_DESC_FLAG; - tx->descp[tx->num_desc].qw[1] |= - dd->default_desc1; + u16 last_desc = tx->num_desc - 1; + + tx->descp[last_desc].qw[0] |= SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[last_desc].qw[1] |= dd->default_desc1; if (tx->flags & SDMA_TXREQ_F_URGENT) - tx->descp[tx->num_desc].qw[1] |= - (SDMA_DESC1_HEAD_TO_HOST_FLAG | - SDMA_DESC1_INT_REQ_FLAG); + tx->descp[last_desc].qw[1] |= (SDMA_DESC1_HEAD_TO_HOST_FLAG | + SDMA_DESC1_INT_REQ_FLAG); } static inline int _sdma_txadd_daddr( @@ -655,6 +654,7 @@ static inline int _sdma_txadd_daddr( type, addr, len); WARN_ON(len > tx->tlen); + tx->num_desc++; tx->tlen -= len; /* special cases for last */ if (!tx->tlen) { @@ -666,7 +666,6 @@ static inline int _sdma_txadd_daddr( _sdma_close_tx(dd, tx); } } - tx->num_desc++; return rval; } diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 186d30291260..96058baf36ed 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -23,18 +23,24 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, const struct mmu_notifier_range *range, unsigned long cur_seq); +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq); static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, - struct tid_group *grp, - unsigned int start, u16 count, + struct tid_group *grp, u16 count, u32 *tidlist, unsigned int *tididx, unsigned int *pmapped); -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp); +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo); +static void __clear_tid_node(struct hfi1_filedata *fd, + struct tid_rb_node *node); static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static const struct mmu_interval_notifier_ops tid_mn_ops = { .invalidate = tid_rb_invalidate, }; +static const struct mmu_interval_notifier_ops tid_cover_ops = { + .invalidate = tid_cover_invalidate, +}; /* * Initialize context and file private data needed for Expected @@ -153,16 +159,11 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd, static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) { int pinned; - unsigned int npages; + unsigned int npages = tidbuf->npages; unsigned long vaddr = tidbuf->vaddr; struct page **pages = NULL; struct hfi1_devdata *dd = fd->uctxt->dd; - /* Get the number of pages the user buffer spans */ - npages = num_user_pages(vaddr, tidbuf->length); - if (!npages) - return -EINVAL; - if (npages > fd->uctxt->expected_count) { dd_dev_err(dd, "Expected buffer too big\n"); return -EINVAL; @@ -189,7 +190,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) return pinned; } tidbuf->pages = pages; - tidbuf->npages = npages; fd->tid_n_pinned += pinned; return pinned; } @@ -249,57 +249,70 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, int ret = 0, need_group = 0, pinned; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; - unsigned int ngroups, pageidx = 0, pageset_count, + unsigned int ngroups, pageset_count, tididx = 0, mapped, mapped_pages = 0; u32 *tidlist = NULL; struct tid_user_buf *tidbuf; + unsigned long mmu_seq = 0; if (!PAGE_ALIGNED(tinfo->vaddr)) return -EINVAL; + if (tinfo->length == 0) + return -EINVAL; tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); if (!tidbuf) return -ENOMEM; + mutex_init(&tidbuf->cover_mutex); tidbuf->vaddr = tinfo->vaddr; tidbuf->length = tinfo->length; + tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length); tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), GFP_KERNEL); if (!tidbuf->psets) { - kfree(tidbuf); - return -ENOMEM; + ret = -ENOMEM; + goto fail_release_mem; + } + + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( + &tidbuf->notifier, current->mm, + tidbuf->vaddr, tidbuf->npages * PAGE_SIZE, + &tid_cover_ops); + if (ret) + goto fail_release_mem; + mmu_seq = mmu_interval_read_begin(&tidbuf->notifier); } pinned = pin_rcv_pages(fd, tidbuf); if (pinned <= 0) { - kfree(tidbuf->psets); - kfree(tidbuf); - return pinned; + ret = (pinned < 0) ? pinned : -ENOSPC; + goto fail_unpin; } /* Find sets of physically contiguous pages */ tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); - /* - * We don't need to access this under a lock since tid_used is per - * process and the same process cannot be in hfi1_user_exp_rcv_clear() - * and hfi1_user_exp_rcv_setup() at the same time. - */ + /* Reserve the number of expected tids to be used. */ spin_lock(&fd->tid_lock); if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) pageset_count = fd->tid_limit - fd->tid_used; else pageset_count = tidbuf->n_psets; + fd->tid_used += pageset_count; spin_unlock(&fd->tid_lock); - if (!pageset_count) - goto bail; + if (!pageset_count) { + ret = -ENOSPC; + goto fail_unreserve; + } ngroups = pageset_count / dd->rcv_entries.group_size; tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); if (!tidlist) { ret = -ENOMEM; - goto nomem; + goto fail_unreserve; } tididx = 0; @@ -318,7 +331,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_pop(&uctxt->tid_group_list); ret = program_rcvarray(fd, tidbuf, grp, - pageidx, dd->rcv_entries.group_size, + dd->rcv_entries.group_size, tidlist, &tididx, &mapped); /* * If there was a failure to program the RcvArray @@ -334,11 +347,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_add_tail(grp, &uctxt->tid_full_list); ngroups--; - pageidx += ret; mapped_pages += mapped; } - while (pageidx < pageset_count) { + while (tididx < pageset_count) { struct tid_group *grp, *ptr; /* * If we don't have any partially used tid groups, check @@ -360,11 +372,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, */ list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, list) { - unsigned use = min_t(unsigned, pageset_count - pageidx, + unsigned use = min_t(unsigned, pageset_count - tididx, grp->size - grp->used); ret = program_rcvarray(fd, tidbuf, grp, - pageidx, use, tidlist, + use, tidlist, &tididx, &mapped); if (ret < 0) { hfi1_cdbg(TID, @@ -376,11 +388,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_move(grp, &uctxt->tid_used_list, &uctxt->tid_full_list); - pageidx += ret; mapped_pages += mapped; need_group = 0; /* Check if we are done so we break out early */ - if (pageidx >= pageset_count) + if (tididx >= pageset_count) break; } else if (WARN_ON(ret == 0)) { /* @@ -395,43 +406,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, } unlock: mutex_unlock(&uctxt->exp_mutex); -nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); - if (tididx) { - spin_lock(&fd->tid_lock); - fd->tid_used += tididx; - spin_unlock(&fd->tid_lock); - tinfo->tidcnt = tididx; - tinfo->length = mapped_pages * PAGE_SIZE; - - if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), - tidlist, sizeof(tidlist[0]) * tididx)) { - /* - * On failure to copy to the user level, we need to undo - * everything done so far so we don't leak resources. - */ - tinfo->tidlist = (unsigned long)&tidlist; - hfi1_user_exp_rcv_clear(fd, tinfo); - tinfo->tidlist = 0; - ret = -EFAULT; - goto bail; + + /* fail if nothing was programmed, set error if none provided */ + if (tididx == 0) { + if (ret >= 0) + ret = -ENOSPC; + goto fail_unreserve; + } + + /* adjust reserved tid_used to actual count */ + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count - tididx; + spin_unlock(&fd->tid_lock); + + /* unpin all pages not covered by a TID */ + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, + false); + + if (fd->use_mn) { + /* check for an invalidate during setup */ + bool fail = false; + + mutex_lock(&tidbuf->cover_mutex); + fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq); + mutex_unlock(&tidbuf->cover_mutex); + + if (fail) { + ret = -EBUSY; + goto fail_unprogram; } } - /* - * If not everything was mapped (due to insufficient RcvArray entries, - * for example), unpin all unmapped pages so we can pin them nex time. - */ - if (mapped_pages != pinned) - unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, - (pinned - mapped_pages), false); -bail: + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), + tidlist, sizeof(tidlist[0]) * tididx)) { + ret = -EFAULT; + goto fail_unprogram; + } + + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + kfree(tidbuf->pages); kfree(tidbuf->psets); + kfree(tidbuf); kfree(tidlist); + return 0; + +fail_unprogram: + /* unprogram, unmap, and unpin all allocated TIDs */ + tinfo->tidlist = (unsigned long)tidlist; + hfi1_user_exp_rcv_clear(fd, tinfo); + tinfo->tidlist = 0; + pinned = 0; /* nothing left to unpin */ + pageset_count = 0; /* nothing left reserved */ +fail_unreserve: + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count; + spin_unlock(&fd->tid_lock); +fail_unpin: + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + if (pinned > 0) + unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); +fail_release_mem: kfree(tidbuf->pages); + kfree(tidbuf->psets); kfree(tidbuf); - return ret > 0 ? 0 : ret; + kfree(tidlist); + return ret; } int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, @@ -452,7 +498,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, mutex_lock(&uctxt->exp_mutex); for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { - ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); + ret = unprogram_rcvarray(fd, tidinfo[tididx]); if (ret) { hfi1_cdbg(TID, "Failed to unprogram rcv array %d", ret); @@ -589,7 +635,6 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) * struct tid_pageset holding information on physically contiguous * chunks from the user buffer), and other fields. * @grp: RcvArray group - * @start: starting index into sets array * @count: number of struct tid_pageset's to program * @tidlist: the array of u32 elements when the information about the * programmed RcvArray entries is to be encoded. @@ -609,14 +654,14 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) * number of RcvArray entries programmed. */ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, - struct tid_group *grp, - unsigned int start, u16 count, + struct tid_group *grp, u16 count, u32 *tidlist, unsigned int *tididx, unsigned int *pmapped) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; u16 idx; + unsigned int start = *tididx; u32 tidinfo = 0, rcventry, useidx = 0; int mapped = 0; @@ -661,8 +706,7 @@ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, return ret; mapped += npages; - tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | - EXP_TID_SET(LEN, npages); + tidinfo = create_tid(rcventry - uctxt->expected_base, npages); tidlist[(*tididx)++] = tidinfo; grp->used++; grp->map |= 1 << useidx++; @@ -706,6 +750,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, } node->fdata = fd; + mutex_init(&node->invalidate_mutex); node->phys = page_to_phys(pages[0]); node->npages = npages; node->rcventry = rcventry; @@ -721,11 +766,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, &tid_mn_ops); if (ret) goto out_unmap; - /* - * FIXME: This is in the wrong order, the notifier should be - * established before the pages are pinned by pin_rcv_pages. - */ - mmu_interval_read_begin(&node->notifier); } fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node; @@ -745,33 +785,29 @@ out_unmap: return -EFAULT; } -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp) +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; struct tid_rb_node *node; - u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tidctrl = EXP_TID_GET(tidinfo, CTRL); u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; - if (tididx >= uctxt->expected_count) { - dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", - tididx, uctxt->ctxt); - return -EINVAL; - } - - if (tidctrl == 0x3) + if (tidctrl == 0x3 || tidctrl == 0x0) return -EINVAL; rcventry = tididx + (tidctrl - 1); + if (rcventry >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + rcventry, uctxt->ctxt); + return -EINVAL; + } + node = fd->entry_to_rb[rcventry]; if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (grp) - *grp = node->grp; - if (fd->use_mn) mmu_interval_notifier_remove(&node->notifier); cacheless_tid_rb_remove(fd, node); @@ -779,23 +815,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, return 0; } -static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; + mutex_lock(&node->invalidate_mutex); + if (node->freed) + goto done; + node->freed = true; + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, node->npages, node->notifier.interval_tree.start, node->phys, node->dma_addr); - /* - * Make sure device has seen the write before we unpin the - * pages. - */ + /* Make sure device has seen the write before pages are unpinned */ hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); +done: + mutex_unlock(&node->invalidate_mutex); +} + +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + __clear_tid_node(fd, node); node->grp->used--; node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); @@ -854,17 +901,22 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, if (node->freed) return true; + /* take action only if unmapping */ + if (range->event != MMU_NOTIFY_UNMAP) + return true; + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->notifier.interval_tree.start, node->rcventry, node->npages, node->dma_addr); - node->freed = true; + + /* clear the hardware rcvarray entry */ + __clear_tid_node(fdata, node); spin_lock(&fdata->invalid_lock); if (fdata->invalid_tid_idx < uctxt->expected_count) { fdata->invalid_tids[fdata->invalid_tid_idx] = - rcventry2tidinfo(node->rcventry - uctxt->expected_base); - fdata->invalid_tids[fdata->invalid_tid_idx] |= - EXP_TID_SET(LEN, node->npages); + create_tid(node->rcventry - uctxt->expected_base, + node->npages); if (!fdata->invalid_tid_idx) { unsigned long *ev; @@ -887,6 +939,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, return true; } +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct tid_user_buf *tidbuf = + container_of(mni, struct tid_user_buf, notifier); + + /* take action only if unmapping */ + if (range->event == MMU_NOTIFY_UNMAP) { + mutex_lock(&tidbuf->cover_mutex); + mmu_interval_set_seq(mni, cur_seq); + mutex_unlock(&tidbuf->cover_mutex); + } + + return true; +} + static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, struct tid_rb_node *tnode) { diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index 8c53e416bf84..f8ee997d0050 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -16,6 +16,8 @@ struct tid_pageset { }; struct tid_user_buf { + struct mmu_interval_notifier notifier; + struct mutex cover_mutex; unsigned long vaddr; unsigned long length; unsigned int npages; @@ -27,6 +29,7 @@ struct tid_user_buf { struct tid_rb_node { struct mmu_interval_notifier notifier; struct hfi1_filedata *fdata; + struct mutex invalidate_mutex; /* covers hw removal */ unsigned long phys; struct tid_group *grp; u32 rcventry; diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 7bce963e2ae6..36aaedc65145 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -29,33 +29,52 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, u32 nlocked, u32 npages) { - unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, - size = (cache_size * (1UL << 20)); /* convert to bytes */ - unsigned int usr_ctxts = - dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; - bool can_lock = capable(CAP_IPC_LOCK); + unsigned long ulimit_pages; + unsigned long cache_limit_pages; + unsigned int usr_ctxts; /* - * Calculate per-cache size. The calculation below uses only a quarter - * of the available per-context limit. This leaves space for other - * pinning. Should we worry about shared ctxts? + * Perform RLIMIT_MEMLOCK based checks unless CAP_IPC_LOCK is present. */ - cache_limit = (ulimit / usr_ctxts) / 4; - - /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */ - if (ulimit != (-1UL) && size > cache_limit) - size = cache_limit; - - /* Convert to number of pages */ - size = DIV_ROUND_UP(size, PAGE_SIZE); - - pinned = atomic64_read(&mm->pinned_vm); + if (!capable(CAP_IPC_LOCK)) { + ulimit_pages = + DIV_ROUND_DOWN_ULL(rlimit(RLIMIT_MEMLOCK), PAGE_SIZE); + + /* + * Pinning these pages would exceed this process's locked memory + * limit. + */ + if (atomic64_read(&mm->pinned_vm) + npages > ulimit_pages) + return false; + + /* + * Only allow 1/4 of the user's RLIMIT_MEMLOCK to be used for HFI + * caches. This fraction is then equally distributed among all + * existing user contexts. Note that if RLIMIT_MEMLOCK is + * 'unlimited' (-1), the value of this limit will be > 2^42 pages + * (2^64 / 2^12 / 2^8 / 2^2). + * + * The effectiveness of this check may be reduced if I/O occurs on + * some user contexts before all user contexts are created. This + * check assumes that this process is the only one using this + * context (e.g., the corresponding fd was not passed to another + * process for concurrent access) as there is no per-context, + * per-process tracking of pinned pages. It also assumes that each + * user context has only one cache to limit. + */ + usr_ctxts = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; + if (nlocked + npages > (ulimit_pages / usr_ctxts / 4)) + return false; + } - /* First, check the absolute limit against all pinned pages. */ - if (pinned + npages >= ulimit && !can_lock) + /* + * Pinning these pages would exceed the size limit for this cache. + */ + cache_limit_pages = cache_size * (1024 * 1024) / PAGE_SIZE; + if (nlocked + npages > cache_limit_pages) return false; - return ((nlocked + npages) <= size) || can_lock; + return true; } int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index e6e17984553c..7f6d7fc7951d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1598,13 +1598,11 @@ static const char * const driver_cntr_names[] = { "DRIVER_EgrHdrFull" }; -static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */ static struct rdma_stat_desc *dev_cntr_descs; static struct rdma_stat_desc *port_cntr_descs; int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); static int num_dev_cntrs; static int num_port_cntrs; -static int cntr_names_initialized; /* * Convert a list of names separated by '\n' into an array of NULL terminated @@ -1615,8 +1613,8 @@ static int init_cntr_names(const char *names_in, const size_t names_len, int num_extra_names, int *num_cntrs, struct rdma_stat_desc **cntr_descs) { - struct rdma_stat_desc *q; - char *names_out, *p; + struct rdma_stat_desc *names_out; + char *p; int i, n; n = 0; @@ -1624,65 +1622,45 @@ static int init_cntr_names(const char *names_in, const size_t names_len, if (names_in[i] == '\n') n++; - names_out = - kzalloc((n + num_extra_names) * sizeof(*q) + names_len, - GFP_KERNEL); + names_out = kzalloc((n + num_extra_names) * sizeof(*names_out) + + names_len, + GFP_KERNEL); if (!names_out) { *num_cntrs = 0; *cntr_descs = NULL; return -ENOMEM; } - p = names_out + (n + num_extra_names) * sizeof(*q); + p = (char *)&names_out[n + num_extra_names]; memcpy(p, names_in, names_len); - q = (struct rdma_stat_desc *)names_out; for (i = 0; i < n; i++) { - q[i].name = p; + names_out[i].name = p; p = strchr(p, '\n'); *p++ = '\0'; } *num_cntrs = n; - *cntr_descs = (struct rdma_stat_desc *)names_out; + *cntr_descs = names_out; return 0; } -static int init_counters(struct ib_device *ibdev) -{ - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - int i, err = 0; - - mutex_lock(&cntr_names_lock); - if (cntr_names_initialized) - goto out_unlock; - - err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, num_driver_cntrs, - &num_dev_cntrs, &dev_cntr_descs); - if (err) - goto out_unlock; - - for (i = 0; i < num_driver_cntrs; i++) - dev_cntr_descs[num_dev_cntrs + i].name = driver_cntr_names[i]; - - err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, 0, - &num_port_cntrs, &port_cntr_descs); - if (err) { - kfree(dev_cntr_descs); - dev_cntr_descs = NULL; - goto out_unlock; - } - cntr_names_initialized = 1; - -out_unlock: - mutex_unlock(&cntr_names_lock); - return err; -} - static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev) { - if (init_counters(ibdev)) - return NULL; + if (!dev_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int i, err; + + err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, + num_driver_cntrs, + &num_dev_cntrs, &dev_cntr_descs); + if (err) + return NULL; + + for (i = 0; i < num_driver_cntrs; i++) + dev_cntr_descs[num_dev_cntrs + i].name = + driver_cntr_names[i]; + } return rdma_alloc_hw_stats_struct(dev_cntr_descs, num_dev_cntrs + num_driver_cntrs, RDMA_HW_STATS_DEFAULT_LIFESPAN); @@ -1691,8 +1669,16 @@ static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev) static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { - if (init_counters(ibdev)) - return NULL; + if (!port_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int err; + + err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, + 0, + &num_port_cntrs, &port_cntr_descs); + if (err) + return NULL; + } return rdma_alloc_hw_stats_struct(port_cntr_descs, num_port_cntrs, RDMA_HW_STATS_DEFAULT_LIFESPAN); } @@ -1917,13 +1903,10 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd) del_timer_sync(&dev->mem_timer); verbs_txreq_exit(dev); - mutex_lock(&cntr_names_lock); kfree(dev_cntr_descs); kfree(port_cntr_descs); dev_cntr_descs = NULL; port_cntr_descs = NULL; - cntr_names_initialized = 0; - mutex_unlock(&cntr_names_lock); } void hfi1_cnp_rcv(struct hfi1_packet *packet) |