From ea4baf7f116a18382df331db2123d98bc1c3cd83 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:28:30 +0200 Subject: RDMA: Rename port_callback to init_port Most provider routines are callback routines which ib core invokes. _callback suffix doesn't convey information about when such callback is invoked. Therefore, rename port_callback to init_port. Additionally, store the init_port function pointer in ib_device_ops, so that it can be accessed in subsequent patches when binding rdma device to net namespace. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ec582d86025f..8957adf58af7 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1622,6 +1622,7 @@ static const struct ib_device_ops hfi1_dev_ops = { .alloc_rdma_netdev = hfi1_vnic_alloc_rn, .get_dev_fw_str = hfi1_get_dev_fw_str, .get_hw_stats = get_hw_stats, + .init_port = hfi1_create_port_files, .modify_device = modify_device, /* keep process mad in the driver */ .process_mad = hfi1_process_mad, @@ -1679,7 +1680,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) /* * Fill in rvt info object. */ - dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; -- cgit v1.2.3-59-g8ed1b From 54747231150f0dddf68f2ee29ec2970fcc433909 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:15:56 +0200 Subject: RDMA: Introduce and use rdma_device_to_ibdev() Introduce and use rdma_device_to_ibdev() API for those drivers which are registering one sysfs group and also use in ib_core. In subsequent patch, device->provider_ibdev one-to-one mapping is no longer holds true during accessing sysfs entries. Therefore, introduce an API rdma_device_to_ibdev() that provides such information. 
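For illustration only (not part of the patch), a minimal sketch of how the two helpers added to include/rdma/ib_verbs.h further below are meant to be used from a driver sysfs handler; the "foo" driver, its fields, and the attribute are hypothetical:

struct foo_dev {
	struct ib_device ibdev;	/* driver structure embeds the ib_device */
	u32 hw_rev;
};

static ssize_t hw_rev_show(struct device *device,
			   struct device_attribute *attr, char *buf)
{
	/* device is &foo_dev->ibdev.dev; recover the driver structure
	 * without relying on dev_get_drvdata(), which this patch removes
	 * from ib_alloc_device().
	 */
	struct foo_dev *fdev =
		rdma_device_to_drv_device(device, struct foo_dev, ibdev);

	return sprintf(buf, "0x%x\n", fdev->hw_rev);
}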
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 -- drivers/infiniband/core/sysfs.c | 12 ++++++------ drivers/infiniband/hw/bnxt_re/main.c | 6 ++++-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 14 ++++++++------ drivers/infiniband/hw/cxgb4/provider.c | 14 ++++++++------ drivers/infiniband/hw/hfi1/sysfs.c | 16 ++++++++-------- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 5 ++--- drivers/infiniband/hw/mlx4/main.c | 7 ++++--- drivers/infiniband/hw/mlx5/main.c | 13 ++++++++----- drivers/infiniband/hw/mthca/mthca_provider.c | 9 ++++++--- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 6 ++++-- drivers/infiniband/hw/qedr/main.c | 3 ++- drivers/infiniband/hw/qib/qib_sysfs.c | 18 +++++++++--------- drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 26 +++++++++++--------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 4 ++-- include/rdma/ib_verbs.h | 23 +++++++++++++++++++++++ 17 files changed, 106 insertions(+), 74 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 66867e92ddea..f8180cf1a004 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -296,8 +296,6 @@ struct ib_device *ib_alloc_device(size_t size) device->dev.class = &ib_class; device_initialize(&device->dev); - dev_set_drvdata(&device->dev, device); - INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); rwlock_init(&device->client_data_lock); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7a5679933df6..c75692802da8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1187,7 +1187,7 @@ err_put: static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); @@ -1204,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), @@ -1217,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->node_guid)[0]), @@ -1230,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%.64s\n", dev->node_desc); } @@ -1239,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; 
@@ -1258,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); ib_get_device_fw_str(dev, buf); strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 797a3e943366..16eecfa5882c 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -538,7 +538,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); } @@ -547,7 +548,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index ffdde3cca268..07c20cd07f33 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1130,8 +1130,9 @@ static int iwch_query_port(struct ib_device *ibdev, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } @@ -1140,8 +1141,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; @@ -1154,8 +1155,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, iwch_dev->rdev.rnic_info.pdev->device); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0a99894b0160..f977f8e7e162 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -376,8 +376,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%d\n", 
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); @@ -387,8 +388,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; @@ -401,8 +402,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, c4iw_dev->rdev.lldi.pdev->device); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 2be513d4c9da..90f62c4bddba 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* @@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. 
*/ @@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); @@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device, size_t count) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); struct hfi1_temp temp; int ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index af66ab9d150b..12b31a8440be 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2139,9 +2139,8 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr) static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct i40iw_ib_device *iwibdev = container_of(dev, - struct i40iw_ib_device, - ibdev.dev); + struct i40iw_ib_device *iwibdev = + rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev); u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev; return sprintf(buf, "%x\n", hw_rev); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c3f950d82ed0..dc2ffd293a11 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2043,7 +2043,7 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static DEVICE_ATTR_RO(hca_type); @@ -2052,7 +2052,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -2061,7 +2061,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 948617a60d44..4b1b56d54301 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4104,7 +4104,7 @@ static ssize_t fw_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); } @@ -4114,7 +4114,7 @@ static ssize_t reg_pages_show(struct device *device, struct device_attribute *attr, 
char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } @@ -4124,7 +4124,8 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static DEVICE_ATTR_RO(hca_type); @@ -4133,7 +4134,8 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -4142,7 +4144,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 3473c6c51b92..63003b4d2485 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1081,7 +1081,8 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + return sprintf(buf, "%x\n", dev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -1090,7 +1091,8 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + switch (dev->pdev->device) { case PCI_DEVICE_ID_MELLANOX_TAVOR: return sprintf(buf, "MT23108\n"); @@ -1111,7 +1113,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); } static DEVICE_ATTR_RO(board_id); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 34601f0cbd74..034156f7e9ed 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2560,7 +2560,7 @@ static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nes_ib_device *nesibdev = - container_of(dev, struct nes_ib_device, ibdev.dev); + rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev); struct nes_vnic *nesvnic = nesibdev->nesvnic; nes_debug(NES_DBG_INIT, "\n"); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index f45b996f617f..b0491b9ecfe4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -118,7 +118,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { 
- struct ocrdma_dev *dev = dev_get_drvdata(device); + struct ocrdma_dev *dev = + rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); } @@ -127,7 +128,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocrdma_dev *dev = dev_get_drvdata(device); + struct ocrdma_dev *dev = + rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); } diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 8e5c76d06855..f85e72b65a10 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -137,7 +137,8 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num, static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct qedr_dev *dev = dev_get_drvdata(device); + struct qedr_dev *dev = + rdma_device_to_drv_device(device, struct qedr_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); } diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 1cf4ca3f23e3..905206a0c2d5 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -555,7 +555,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -565,7 +565,7 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -590,7 +590,7 @@ static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -602,7 +602,7 @@ static ssize_t localbus_info_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -614,7 +614,7 @@ static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of user ports (contexts) available. */ @@ -630,7 +630,7 @@ static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. 
*/ @@ -642,7 +642,7 @@ static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); buf[sizeof(dd->serial)] = '\0'; @@ -657,7 +657,7 @@ static ssize_t chip_reset_store(struct device *device, size_t count) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -679,7 +679,7 @@ static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; int idx; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index a7e4b2ccfaf8..c85d48ae7442 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -50,7 +50,7 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev = - container_of(device, struct usnic_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); unsigned short subsystem_device_id; mutex_lock(&us_ibdev->usdev_lock); @@ -67,14 +67,13 @@ static DEVICE_ATTR_RO(board_id); static ssize_t config_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); char *ptr; unsigned left; unsigned n; enum usnic_vnic_res_type res_type; - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); - /* Buffer space limit is 1 page */ ptr = buf; left = PAGE_SIZE; @@ -130,9 +129,8 @@ static DEVICE_ATTR_RO(config); static ssize_t iface_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%s\n", netdev_name(us_ibdev->netdev)); @@ -142,9 +140,8 @@ static DEVICE_ATTR_RO(iface); static ssize_t max_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%u\n", kref_read(&us_ibdev->vf_cnt)); @@ -154,10 +151,10 @@ static DEVICE_ATTR_RO(max_vf); static ssize_t qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); int qp_per_vf; - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); @@ -169,9 +166,8 @@ static DEVICE_ATTR_RO(qp_per_vf); static ssize_t cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = 
container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%d\n", us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 43171148e9c5..3d01247a28db 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1129,8 +1129,8 @@ static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { - struct rxe_dev *rxe = container_of(device, struct rxe_dev, - ib_dev.dev); + struct rxe_dev *rxe = + rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1)); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1d1902fd9f87..94b6e1dd4dab 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4241,4 +4241,27 @@ rdma_set_device_sysfs_group(struct ib_device *dev, dev->groups[1] = group; } +/** + * rdma_device_to_ibdev - Get ib_device pointer from device pointer + * + * @device: device pointer for which ib_device pointer to retrieve + * + * rdma_device_to_ibdev() retrieves ib_device pointer from device. + * + */ +static inline struct ib_device *rdma_device_to_ibdev(struct device *device) +{ + return container_of(device, struct ib_device, dev); +} + +/** + * rdma_device_to_drv_device - Helper macro to reach back to driver's + * ib_device holder structure from device pointer. + * + * NOTE: New drivers should not make use of this API; This API is only for + * existing drivers who have exposed sysfs entries using + * rdma_set_device_sysfs_group(). + */ +#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ + container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) #endif /* IB_VERBS_H */ -- cgit v1.2.3-59-g8ed1b From 5c43276499f912ae7aec06737a3c6e0f0f3ba74b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:17:54 +0100 Subject: infiniband: hfi1: drop crazy DEBUGFS_SEQ_FILE_CREATE() macro The macro was just making things harder to follow, and audit, so remove it and call debugfs_create_file() directly. Also, the macro did not need to warn about the call failing as no one should ever care about any debugfs functions failing. 
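For context, a minimal sketch (hypothetical names, not taken from the patch) of the direct debugfs_create_file() call that replaces DEBUGFS_SEQ_FILE_CREATE(); the return value is deliberately not checked:

#include <linux/debugfs.h>

static struct dentry *foo_dbg_root;

static void foo_dbg_init(void *data, const struct file_operations *fops)
{
	foo_dbg_root = debugfs_create_dir("foo", NULL);
	/* If debugfs is unavailable this quietly creates nothing;
	 * the driver keeps working either way.
	 */
	debugfs_create_file("opcode_stats", 0444, foo_dbg_root, data, fops);
}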
Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/debugfs.c | 56 ++++++++++++++++++------------------ drivers/infiniband/hw/hfi1/debugfs.h | 12 -------- drivers/infiniband/hw/hfi1/fault.c | 3 +- 3 files changed, 30 insertions(+), 41 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 0a557795563c..aeb0f07103de 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1167,6 +1167,7 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) char link[10]; struct hfi1_devdata *dd = dd_from_dev(ibd); struct hfi1_pportdata *ppd; + struct dentry *root; int unit = dd->unit; int i, j; @@ -1174,31 +1175,29 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) return; snprintf(name, sizeof(name), "%s_%d", class_name(), unit); snprintf(link, sizeof(link), "%d", unit); - ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root); - if (!ibd->hfi1_ibdev_dbg) { - pr_warn("create of %s failed\n", name); - return; - } + root = debugfs_create_dir(name, hfi1_dbg_root); + ibd->hfi1_ibdev_dbg = root; + ibd->hfi1_ibdev_link = debugfs_create_symlink(link, hfi1_dbg_root, name); - if (!ibd->hfi1_ibdev_link) { - pr_warn("create of %s symlink failed\n", name); - return; - } - DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(pios, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd); + + debugfs_create_file("opcode_stats", 0444, root, ibd, + &_opcode_stats_file_ops); + debugfs_create_file("tx_opcode_stats", 0444, root, ibd, + &_tx_opcode_stats_file_ops); + debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops); + debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops); + debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops); + debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops); + debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops); + debugfs_create_file("sdma_cpu_list", 0444, root, ibd, + &_sdma_cpu_list_file_ops); + /* dev counter files */ for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) - DEBUGFS_FILE_CREATE(cntr_ops[i].name, - ibd->hfi1_ibdev_dbg, - dd, - &cntr_ops[i].ops, S_IRUGO); + debugfs_create_file(cntr_ops[i].name, 0444, root, dd, + &cntr_ops[i].ops); + /* per port files */ for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { @@ -1206,12 +1205,11 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) sizeof(name), port_cntr_ops[i].name, j + 1); - DEBUGFS_FILE_CREATE(name, - ibd->hfi1_ibdev_dbg, - ppd, - &port_cntr_ops[i].ops, + debugfs_create_file(name, !port_cntr_ops[i].ops.write ? 
- S_IRUGO : S_IRUGO | S_IWUSR); + S_IRUGO : + S_IRUGO | S_IWUSR, + root, ppd, &port_cntr_ops[i].ops); } hfi1_fault_init_debugfs(ibd); @@ -1343,8 +1341,10 @@ void hfi1_dbg_init(void) hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); if (!hfi1_dbg_root) pr_warn("init of debugfs failed\n"); - DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL); - DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL); + debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL, + &_driver_stats_names_file_ops); + debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL, + &_driver_stats_file_ops); } void hfi1_dbg_exit(void) diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h index d5d824459fcc..57e582caa5eb 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.h +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -49,16 +49,6 @@ struct hfi1_ibdev; -#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ -do { \ - struct dentry *ent; \ - const char *__name = name; \ - ent = debugfs_create_file(__name, mode, parent, \ - data, ops); \ - if (!ent) \ - pr_warn("create of %s failed\n", __name); \ -} while (0) - #define DEBUGFS_SEQ_FILE_OPS(name) \ static const struct seq_operations _##name##_seq_ops = { \ .start = _##name##_seq_start, \ @@ -89,8 +79,6 @@ static const struct file_operations _##name##_file_ops = { \ .release = seq_release \ } -#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ - DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444) ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos); diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index e2290f32c8d9..dd09b8544568 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -278,7 +278,8 @@ int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) return -ENOENT; } - DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd); + debugfs_create_file("fault_stats", 0444, ibd->fault->dir, ibd, + &_fault_stats_file_ops); if (!debugfs_create_bool("enable", 0600, ibd->fault->dir, &ibd->fault->enable)) goto fail; -- cgit v1.2.3-59-g8ed1b From e77511802560728eadfa87f3a1ac92be42af6f96 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:17:55 +0100 Subject: infiniband: hfi1: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. 
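A minimal sketch (hypothetical structure, not from the patch) of the calling convention the fault.c hunk below is converted to: each debugfs_create_* helper is invoked only for its side effect and its result is ignored, so the goto-fail unwinding disappears:

#include <linux/debugfs.h>

struct foo_fault {
	struct dentry *dir;
	bool enable;
	u64 skip_pkts;
	u8 direction;
};

static void foo_fault_debugfs_init(struct foo_fault *f, struct dentry *parent)
{
	f->dir = debugfs_create_dir("fault", parent);
	/* No error handling on any of these calls. */
	debugfs_create_bool("enable", 0600, f->dir, &f->enable);
	debugfs_create_u64("skip_pkts", 0600, f->dir, &f->skip_pkts);
	debugfs_create_u8("direction", 0600, f->dir, &f->direction);
}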
Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/debugfs.c | 2 -- drivers/infiniband/hw/hfi1/fault.c | 50 +++++++++++++----------------------- 2 files changed, 18 insertions(+), 34 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index aeb0f07103de..427ba0ce74a5 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1339,8 +1339,6 @@ DEBUGFS_FILE_OPS(driver_stats); void hfi1_dbg_init(void) { hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); - if (!hfi1_dbg_root) - pr_warn("init of debugfs failed\n"); debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL, &_driver_stats_names_file_ops); debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL, diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index dd09b8544568..3fd3315d0fb0 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -250,6 +250,7 @@ void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) { struct dentry *parent = ibd->hfi1_ibdev_dbg; + struct dentry *fault_dir; ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL); if (!ibd->fault) @@ -269,46 +270,31 @@ int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) bitmap_zero(ibd->fault->opcodes, sizeof(ibd->fault->opcodes) * BITS_PER_BYTE); - ibd->fault->dir = - fault_create_debugfs_attr("fault", parent, - &ibd->fault->attr); - if (IS_ERR(ibd->fault->dir)) { + fault_dir = + fault_create_debugfs_attr("fault", parent, &ibd->fault->attr); + if (IS_ERR(fault_dir)) { kfree(ibd->fault); ibd->fault = NULL; return -ENOENT; } + ibd->fault->dir = fault_dir; - debugfs_create_file("fault_stats", 0444, ibd->fault->dir, ibd, + debugfs_create_file("fault_stats", 0444, fault_dir, ibd, &_fault_stats_file_ops); - if (!debugfs_create_bool("enable", 0600, ibd->fault->dir, - &ibd->fault->enable)) - goto fail; - if (!debugfs_create_bool("suppress_err", 0600, - ibd->fault->dir, - &ibd->fault->suppress_err)) - goto fail; - if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir, - &ibd->fault->opcode)) - goto fail; - if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir, - ibd->fault, &__fault_opcodes_fops)) - goto fail; - if (!debugfs_create_u64("skip_pkts", 0600, - ibd->fault->dir, - &ibd->fault->fault_skip)) - goto fail; - if (!debugfs_create_u64("skip_usec", 0600, - ibd->fault->dir, - &ibd->fault->fault_skip_usec)) - goto fail; - if (!debugfs_create_u8("direction", 0600, ibd->fault->dir, - &ibd->fault->direction)) - goto fail; + debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable); + debugfs_create_bool("suppress_err", 0600, fault_dir, + &ibd->fault->suppress_err); + debugfs_create_bool("opcode_mode", 0600, fault_dir, + &ibd->fault->opcode); + debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault, + &__fault_opcodes_fops); + debugfs_create_u64("skip_pkts", 0600, fault_dir, + &ibd->fault->fault_skip); + debugfs_create_u64("skip_usec", 0600, fault_dir, + &ibd->fault->fault_skip_usec); + debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction); return 0; -fail: - hfi1_fault_exit_debugfs(ibd); - return -ENOMEM; } bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -- cgit v1.2.3-59-g8ed1b From 87fc34b575fdb90994c6063541d79cd18b31b14d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Ruhl" Date: Wed, 23 Jan 2019 19:08:19 -0800 Subject: IB/{hfi1,qib}: Cleanup open coded sge sizing Sge sizing is done in several places using an open coded method. This can cause maintenance issues. The open coded method is encapsulated in a helper routine. The helper was introduced with commit: 1198fcea8a78 ("IB/hfi1, rdmavt: Move SGE state helper routines into rdmavt") Update all call sites that have the open coded path with the helper routine. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/verbs.c | 12 ++---------- drivers/infiniband/hw/qib/qib_ud.c | 6 +----- drivers/infiniband/hw/qib/qib_verbs.c | 18 +++--------------- 3 files changed, 6 insertions(+), 30 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 8957adf58af7..c980345cf1e1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -553,11 +553,7 @@ static noinline int build_verbs_ulp_payload( int ret = 0; while (length) { - len = ss->sge.length; - if (len > length) - len = length; - if (len > ss->sge.sge_length) - len = ss->sge.sge_length; + len = rvt_get_sge_length(&ss->sge, length); WARN_ON_ONCE(len == 0); ret = sdma_txadd_kvaddr( sde->dd, @@ -914,12 +910,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (ss) { while (len) { void *addr = ss->sge.vaddr; - u32 slen = ss->sge.length; + u32 slen = rvt_get_sge_length(&ss->sge, len); - if (slen > len) - slen = len; - if (slen > ss->sge.sge_length) - slen = ss->sge.sge_length; rvt_update_sge(ss, slen, false); seg_pio_copy_mid(pbuf, addr, slen); len -= slen; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 868da0ece7ba..6668bbf4e96d 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -172,12 +172,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) ssge.num_sge = swqe->wr.num_sge; sge = &ssge.sge; while (length) { - u32 len = sge->length; + u32 len = rvt_get_sge_length(sge, length); - if (len > length) - len = length; - if (len > sge->sge_length) - len = sge->sge_length; rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ff8dab8e2344..5ff32d32c61c 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -144,12 +144,8 @@ static u32 qib_count_sge(struct rvt_sge_state *ss, u32 length) u32 ndesc = 1; /* count the header */ while (length) { - u32 len = sge.length; + u32 len = rvt_get_sge_length(&sge, length); - if (len > length) - len = length; - if (len > sge.sge_length) - len = sge.sge_length; if (((long) sge.vaddr & (sizeof(u32) - 1)) || (len != length && (len & (sizeof(u32) - 1)))) { ndesc = 0; @@ -186,12 +182,8 @@ static void qib_copy_from_sge(void *data, struct rvt_sge_state *ss, u32 length) struct rvt_sge *sge = &ss->sge; while (length) { - u32 len = sge->length; + u32 len = rvt_get_sge_length(sge, length); - if (len > length) - len = length; - if (len > sge->sge_length) - len = sge->sge_length; memcpy(data, sge->vaddr, len); sge->vaddr += len; sge->length -= len; @@ -440,13 +432,9 @@ static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, u32 last; while (1) { - u32 len = ss->sge.length; + u32 len = 
rvt_get_sge_length(&ss->sge, length); u32 off; - if (len > length) - len = length; - if (len > ss->sge.sge_length) - len = ss->sge.sge_length; /* If the source address is not aligned, try to align it. */ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); if (off) { -- cgit v1.2.3-59-g8ed1b From db421a54996c602503204345171c662e65f20527 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 23 Jan 2019 19:08:29 -0800 Subject: IB/{hfi1, qib, rvt} Cleanup open coded sge usage Several locations for manipulating sges use an open coded sequence that is covered by helper functions. Use the appropriate helper functions. Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ud.c | 24 ++---------------------- drivers/infiniband/hw/qib/qib_sdma.c | 26 ++------------------------ drivers/infiniband/sw/rdmavt/qp.c | 26 ++------------------------ 3 files changed, 6 insertions(+), 70 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 88242fe95eaa..c98d94cda15e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -222,31 +222,11 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) ssge.num_sge = swqe->wr.num_sge; sge = &ssge.sge; while (length) { - u32 len = sge->length; + u32 len = rvt_get_sge_length(sge, length); - if (len > length) - len = length; - if (len > sge->sge_length) - len = sge->sge_length; WARN_ON_ONCE(len == 0); rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (--ssge.num_sge) - *sge = *ssge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } + rvt_update_sge(&ssge, len, false); length -= len; } rvt_put_ss(&qp->r_sge); diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index 3d64081c4819..99e11c347130 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -565,13 +565,8 @@ retry: sge = &ss->sge; while (dwords) { u32 dw; - u32 len; + u32 len = rvt_get_sge_length(sge, dwords << 2); - len = dwords << 2; - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; dw = (len + 3) >> 2; addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr, dw << 2, DMA_TO_DEVICE); @@ -594,24 +589,7 @@ retry: descqp = &ppd->sdma_descq[0].qw[0]; ++ppd->sdma_generation; } - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (--ss->num_sge) - *sge = *ss->sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - + rvt_update_sge(ss, len, false); dwoffset += dw; dwords -= dw; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a1bd8cfc2c25..16247d2a671d 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2986,34 +2986,12 @@ do_write: sge = &sqp->s_sge.sge; while (sqp->s_len) { - u32 len = sqp->s_len; + u32 len 
= rvt_get_sge_length(sge, sqp->s_len); - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; WARN_ON_ONCE(len == 0); rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, copy_last); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } + rvt_update_sge(&sqp->s_sge, len, !release); sqp->s_len -= len; } if (release) -- cgit v1.2.3-59-g8ed1b From 44e43d91ad4731d9e2e70c60eecc5982d6671e8c Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Thu, 24 Jan 2019 06:09:46 -0800 Subject: IB/hfi1: OPFN support discovery OPFN (Omni Path Feature Negotiation) support discovery allows a RC QP to announce that it supports OPFN and also discover if OPFN is supported by the peer QP. OPFN parameter negotiation is skipped unless OPFN support is first discovered. OPFN support is announced by claiming what was the reserved bit in dword 1 of OmniPath modified base transport header in requests and responses. Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/opfn.h | 53 ++++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/rc.c | 16 ++++++++---- drivers/infiniband/hw/hfi1/ruc.c | 16 ++++++------ drivers/infiniband/hw/hfi1/uc.c | 3 ++- drivers/infiniband/hw/hfi1/verbs.h | 3 ++- 6 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/opfn.h (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6db2276f5c13..ddfcf2fe40ca 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -73,6 +73,7 @@ #include "chip_registers.h" #include "common.h" +#include "opfn.h" #include "verbs.h" #include "pio.h" #include "chip.h" diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h new file mode 100644 index 000000000000..1a2b3449df67 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#ifndef _HFI1_OPFN_H +#define _HFI1_OPFN_H + +/** + * DOC: Omni Path Feature Negotion (OPFN) + * + * OPFN is a discovery protocol for Intel Omni-Path fabric that + * allows two RC QPs to negotiate a common feature that both QPs + * can support. Currently, the only OPA feature that OPFN + * supports is TID RDMA. + * + * Architecture + * + * OPFN involves the communication between two QPs on the HFI + * level on an Omni-Path fabric, and ULPs have no knowledge of + * OPFN at all. 
+ * + * Implementation + * + * OPFN extends the existing IB RC protocol with the following + * changes: + * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport + * Header (BTH1) to indicate that the RC QP supports OPFN; + * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and + * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN + * request; The 64-bit data carried with the request/response + * contains the parameters for negotiation and will be + * defined in tid_rdma.c file; + * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN. + * + * The OPFN communication will be triggered when an RC QP + * receives a request with Bit 24 of BTH1 set. The responder QP + * will then post send an OPFN request with its local + * parameters, which will be sent to the requester QP once all + * existing requests on the responder QP side have been sent. + * Once the requester QP receives the OPFN request, it will + * keep a copy of the responder QP's parameters, and return a + * response packet with its own local parameters. The responder + * QP receives the response packet and keeps a copy of the requester + * QP's parameters. After this exchange, each side has the parameters + * for both sides and therefore can select the right parameters + * for future transactions + */ + +/* STL Verbs Extended */ +#define IB_BTHE_E_SHIFT 24 + +#endif /* _HFI1_OPFN_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index be603f35d7e4..940e9236c328 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -89,8 +89,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct rvt_ack_entry *e; u32 hwords; u32 len; - u32 bth0; - u32 bth2; + u32 bth0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; struct hfi1_qp_priv *priv = qp->priv; @@ -229,7 +229,7 @@ normal: ps->s_txreq->sde = priv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; - hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; bail: @@ -262,8 +262,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct rvt_swqe *wqe; u32 hwords; u32 len; - u32 bth0 = 0; - u32 bth2; + u32 bth0 = 0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; @@ -693,6 +693,7 @@ no_flow_control: qp, ohdr, bth0 | (qp->s_state << 24), + bth1, bth2, middle, ps); @@ -796,6 +797,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; bth1 = (!!is_fecn) << IB_BECN_SHIFT; + /* + * Inline ACKs go out without the use of the Verbs send engine, so + * we need to set the STL Verbs Extended bit here + */ + bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); } diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 7fb317c711df..f96c0f544cb0 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, struct ib_other_headers *ohdr, u32 bth0, u32 bth1, u32 bth2) { - bth1 |= qp->remote_qpn; ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(bth1); ohdr->bth[2] = cpu_to_be32(bth2); @@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, */ static inline void 
hfi1_make_ruc_header_16B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 bth1 = 0; u32 slid; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u8 l4 = OPA_16B_L4_IB_LOCAL; @@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, */ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; - u32 bth1 = 0; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u16 lrh0 = HFI1_LRH_BTH; u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; @@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); /* We support only two types - 9B and 16B for now */ @@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { }; void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; @@ -446,7 +445,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, priv->s_ahg->ahgidx = 0; /* Make the appropriate header */ - hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps); + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle, + ps); } /* when sending, force a reschedule every one of these periods */ diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 6ba47037c424..4ed4fcfabd6c 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -271,7 +271,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ps->s_txreq->ss = &qp->s_sge; ps->s_txreq->s_cur_size = len; hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), - mask_psn(qp->s_psn++), middle, ps); + qp->remote_qpn, mask_psn(qp->s_psn++), + middle, ps); return 1; done_free_tx: diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 1ad0b14bdb3c..8834119184b3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -72,6 +72,7 @@ struct hfi1_packet; #include "iowait.h" #include "tid_rdma.h" +#include "opfn.h" #define HFI1_MAX_RDMA_ATOMIC 16 @@ -356,7 +357,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, const struct ib_global_route *grh, u32 hwords, u32 nwords); void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); void _hfi1_do_send(struct work_struct *work); -- cgit v1.2.3-59-g8ed1b From d22a207d74adb0b43742f83d025079207425928b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:20:42 -0800 Subject: IB/hfi1: Add OPFN helper functions for TID RDMA feature This patch adds the OPFN helper functions to initialize, encode, decode, and reset OPFN parameters for the TID RDMA feature. 
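For orientation only, a minimal sketch (helper names are hypothetical and not part of the patch; assumes 4 KiB pages) of the mask-and-shift packing convention used by the tid_rdma_opfn_encode()/tid_rdma_opfn_decode() pair added below, shown for the max_len field (mask 0x7ff, shift 37):

static inline u64 opfn_pack_max_len(u32 max_len)
{
	/* Stored as (pages - 1) in bits 47..37 of the OPFN quadword. */
	return ((u64)((max_len >> PAGE_SHIFT) - 1) & 0x7ff) << 37;
}

static inline u32 opfn_unpack_max_len(u64 data)
{
	return (((u32)(data >> 37) & 0x7ff) + 1) << PAGE_SHIFT;
}

/* Example: max_len = 256 KiB encodes as 63 (0x3f) in bits 47..37 and
 * decodes back to 256 KiB, matching TID_RDMA_MAX_SEGMENT_SIZE.
 */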
Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 11 ++ drivers/infiniband/hw/hfi1/chip.h | 3 +- drivers/infiniband/hw/hfi1/init.c | 2 + drivers/infiniband/hw/hfi1/opfn.h | 5 + drivers/infiniband/hw/hfi1/tid_rdma.c | 202 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 28 ++++- drivers/infiniband/hw/hfi1/verbs.h | 5 + 7 files changed, 254 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b443642eac02..4d40311f082e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -5222,6 +5222,17 @@ int is_bx(struct hfi1_devdata *dd) return (chip_rev_minor & 0xF0) == 0x10; } +/* return true is kernel urg disabled for rcd */ +bool is_urg_masked(struct hfi1_ctxtdata *rcd) +{ + u64 mask; + u32 is = IS_RCVURGENT_START + rcd->ctxt; + u8 bit = is % 64; + + mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64))); + return !(mask & BIT_ULL(bit)); +} + /* * Append string s to buffer buf. Arguments curp and len are the current * position and remaining length, respectively. diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 6b9c8f12dff8..ba3d99e6e33b 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1,7 +1,7 @@ #ifndef _CHIP_H #define _CHIP_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd); u32 hdrqempty(struct hfi1_ctxtdata *rcd); int is_ax(struct hfi1_devdata *dd); int is_bx(struct hfi1_devdata *dd); +bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); const char *opa_lstate_name(u32 lstate); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 7835eb52e7c5..2ba5a2a8b68f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -927,6 +927,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); + if (!lastfail) + lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); if (lastfail) { dd_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h index 1a2b3449df67..1927c9862b8f 100644 --- a/drivers/infiniband/hw/hfi1/opfn.h +++ b/drivers/infiniband/hw/hfi1/opfn.h @@ -50,4 +50,9 @@ /* STL Verbs Extended */ #define IB_BTHE_E_SHIFT 24 +struct hfi1_opfn_data { + /* serialize opfn function calls */ + spinlock_t lock; +}; + #endif /* _HFI1_OPFN_H */ diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index da1ecb68a928..a8fd66f31fee 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -8,6 +8,208 @@ #include "verbs.h" #include "tid_rdma.h" +/* + * J_KEY for kernel contexts when TID RDMA is used. + * See generate_jkey() in hfi.h for more information. 
+ */ +#define TID_RDMA_JKEY 32 +#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE +#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) + +#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 +#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 + +#define TID_OPFN_QP_CTXT_MASK 0xff +#define TID_OPFN_QP_CTXT_SHIFT 56 +#define TID_OPFN_QP_KDETH_MASK 0xff +#define TID_OPFN_QP_KDETH_SHIFT 48 +#define TID_OPFN_MAX_LEN_MASK 0x7ff +#define TID_OPFN_MAX_LEN_SHIFT 37 +#define TID_OPFN_TIMEOUT_MASK 0x1f +#define TID_OPFN_TIMEOUT_SHIFT 32 +#define TID_OPFN_RESERVED_MASK 0x3f +#define TID_OPFN_RESERVED_SHIFT 26 +#define TID_OPFN_URG_MASK 0x1 +#define TID_OPFN_URG_SHIFT 25 +#define TID_OPFN_VER_MASK 0x7 +#define TID_OPFN_VER_SHIFT 22 +#define TID_OPFN_JKEY_MASK 0x3f +#define TID_OPFN_JKEY_SHIFT 16 +#define TID_OPFN_MAX_READ_MASK 0x3f +#define TID_OPFN_MAX_READ_SHIFT 10 +#define TID_OPFN_MAX_WRITE_MASK 0x3f +#define TID_OPFN_MAX_WRITE_SHIFT 4 + +/* + * OPFN TID layout + * + * 63 47 31 15 + * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC + * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 + * N - the context Number + * K - the Kdeth_qp + * M - Max_len + * T - Timeout + * D - reserveD + * V - version + * U - Urg capable + * J - Jkey + * R - max_Read + * W - max_Write + * C - Capcode + */ + +static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) +{ + return + (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << + TID_OPFN_QP_CTXT_SHIFT) | + ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << + TID_OPFN_QP_KDETH_SHIFT) | + (((u64)((p->max_len >> PAGE_SHIFT) - 1) & + TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | + (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << + TID_OPFN_TIMEOUT_SHIFT) | + (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | + (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | + (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << + TID_OPFN_MAX_READ_SHIFT) | + (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << + TID_OPFN_MAX_WRITE_SHIFT); +} + +static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) +{ + p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & + TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; + p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; + p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & + TID_OPFN_MAX_WRITE_MASK; + p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & + TID_OPFN_MAX_READ_MASK; + p->qp = + ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) + << 16) | + ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); + p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; + p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; +} + +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) +{ + struct hfi1_qp_priv *priv = qp->priv; + + p->qp = (kdeth_qp << 16) | priv->rcd->ctxt; + p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; + p->jkey = priv->rcd->jkey; + p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; + p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; + p->timeout = qp->timeout; + p->urg = is_urg_masked(priv->rcd); +} + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) +{ + struct hfi1_qp_priv *priv = qp->priv; + + *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); + return true; +} + +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *remote, *old; + bool ret = true; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + data &= ~0xfULL; + /* + * 
If data passed in is zero, return true so as not to continue the + * negotiation process + */ + if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) + goto null; + /* + * If kzalloc fails, return false. This will result in: + * * at the requester a new OPFN request being generated to retry + * the negotiation + * * at the responder, 0 being returned to the requester so as to + * disable TID RDMA at both the requester and the responder + */ + remote = kzalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) { + ret = false; + goto null; + } + + tid_rdma_opfn_decode(remote, data); + priv->tid_timer_timeout_jiffies = + usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / + 1000UL) << 3) * 7); + rcu_assign_pointer(priv->tid_rdma.remote, remote); + /* + * A TID RDMA READ request's segment size is not equal to + * remote->max_len only when the request's data length is smaller + * than remote->max_len. In that case, there will be only one segment. + * Therefore, when priv->pkts_ps is used to calculate req->cur_seg + * during retry, it will lead to req->cur_seg = 0, which is exactly + * what is expected. + */ + priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); + priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; + goto free; +null: + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + priv->timeout_shift = 0; +free: + if (old) + kfree_rcu(old, rcu_head); + return ret; +} + +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) +{ + bool ret; + + ret = tid_rdma_conn_reply(qp, *data); + *data = 0; + /* + * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate + * TID RDMA could not be enabled. This will result in TID RDMA being + * disabled at the requester too. + */ + if (ret) + (void)tid_rdma_conn_req(qp, data); + return ret; +} + +void tid_rdma_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *old; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + if (old) + kfree_rcu(old, rcu_head); +} + +/* This is called at context initialization time */ +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) +{ + if (reinit) + return 0; + + BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); + BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); + rcd->jkey = TID_RDMA_JKEY; + hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); + return 0; +} + /** * qp_to_rcd - determine the receive context used by a qp * @qp - the qp diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 6fcd3adcdcc3..18c6d4333f1e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -6,8 +6,34 @@ #ifndef HFI1_TID_RDMA_H #define HFI1_TID_RDMA_H +#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ + +struct tid_rdma_params { + struct rcu_head rcu_head; + u32 qp; + u32 max_len; + u16 jkey; + u8 max_read; + u8 max_write; + u8 timeout; + u8 urg; + u8 version; +}; + +struct tid_rdma_qp_params { + struct tid_rdma_params local; + struct tid_rdma_params __rcu *remote; +}; + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); +void tid_rdma_conn_error(struct rvt_qp *qp); +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); + +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); + int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct 
ib_qp_init_attr *init_attr); #endif /* HFI1_TID_RDMA_H */ - diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 8834119184b3..c8baa1e38ff6 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -161,8 +161,13 @@ struct hfi1_qp_priv { struct hfi1_ctxtdata *rcd; /* QP's receive context */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct hfi1_opfn_data opfn; + struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + unsigned long tid_timer_timeout_jiffies; + u16 pkts_ps; /* packets per segment */ + u8 timeout_shift; /* account for number of packets per segment */ }; /* -- cgit v1.2.3-59-g8ed1b From f01b4d5a43da47a9f61618a81a4ff3258ddc2c01 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:20:52 -0800 Subject: IB/hfi1: OPFN interface OPFN allows a pair of connected RC QPs to exchange a set of parameters in succession. The parameter exchange itself is done using the IB compare and swap request with a special virtual address. The request is triggered using a reserved IB work request opcode. This patch implements the OPFN interface to initialize, start, process, and reset the OPFN request. Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/hfi.h | 2 + drivers/infiniband/hw/hfi1/init.c | 1 - drivers/infiniband/hw/hfi1/opfn.c | 304 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/opfn.h | 27 ++++ 5 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/hfi1/opfn.c (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 3ce9dc8c3463..4044a8c8dbf4 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -24,6 +24,7 @@ hfi1-y := \ mad.o \ mmu_rb.o \ msix.o \ + opfn.o \ pcie.o \ pio.o \ pio_copy.o \ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index ddfcf2fe40ca..9aa0357e17b7 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -99,6 +99,8 @@ #define NEIGHBOR_TYPE_HFI 0 #define NEIGHBOR_TYPE_SWITCH 1 +#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 + extern unsigned long hfi1_cap_mask; #define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) #define HFI1_CAP_UGET_MASK(mask, cap) \ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 2ba5a2a8b68f..09c898d0975c 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -72,7 +72,6 @@ #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt -#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 /* * min buffers we want to have per context, after driver */ diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c new file mode 100644 index 000000000000..2d46c91eb129 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. 
+ * + */ +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "opfn.h" + +#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT) + +#define OPFN_CODE(code) BIT((code) - 1) +#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code) + +struct hfi1_opfn_type { + bool (*request)(struct rvt_qp *qp, u64 *data); + bool (*response)(struct rvt_qp *qp, u64 *data); + bool (*reply)(struct rvt_qp *qp, u64 data); + void (*error)(struct rvt_qp *qp); +}; + +static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = { + [STL_VERBS_EXTD_TID_RDMA] = { + .request = tid_rdma_conn_req, + .response = tid_rdma_conn_resp, + .reply = tid_rdma_conn_reply, + .error = tid_rdma_conn_error, + }, +}; + +static struct workqueue_struct *opfn_wq; + +static void opfn_schedule_conn_request(struct rvt_qp *qp); + +static bool hfi1_opfn_extended(u32 bth1) +{ + return !!(bth1 & IB_BTHE_E); +} + +static void opfn_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_atomic_wr wr; + u16 mask, capcode; + struct hfi1_opfn_type *extd; + u64 data; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Exit if the extended bit is not set, or if nothing is requested, or + * if we have completed all requests, or if a previous request is in + * progress + */ + if (!priv->opfn.extended || !priv->opfn.requested || + priv->opfn.requested == priv->opfn.completed || priv->opfn.curr) + goto done; + + mask = priv->opfn.requested & ~priv->opfn.completed; + capcode = ilog2(mask & ~(mask - 1)) + 1; + if (capcode >= STL_VERBS_EXTD_MAX) { + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + extd = &hfi1_opfn_handlers[capcode]; + if (!extd || !extd->request || !extd->request(qp, &data)) { + /* + * Either there is no handler for this capability or the request + * packet could not be generated. Either way, mark it as done so + * we don't keep attempting to complete it. 
+ */ + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + data = (data & ~0xf) | capcode; + + memset(&wr, 0, sizeof(wr)); + wr.wr.opcode = IB_WR_OPFN; + wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR; + wr.compare_add = data; + + priv->opfn.curr = capcode; /* A new request is now in progress */ + /* Drop opfn.lock before calling ib_post_send() */ + spin_unlock_irqrestore(&priv->opfn.lock, flags); + + ret = ib_post_send(&qp->ibqp, &wr.wr, NULL); + if (ret) + goto err; + return; +err: + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * In case of an unexpected error return from ib_post_send + * clear opfn.curr and reschedule to try again + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + opfn_schedule_conn_request(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_send_conn_request(struct work_struct *work) +{ + struct hfi1_opfn_data *od; + struct hfi1_qp_priv *qpriv; + + od = container_of(work, struct hfi1_opfn_data, opfn_work); + qpriv = container_of(od, struct hfi1_qp_priv, opfn); + + opfn_conn_request(qpriv->owner); +} + +/* + * When QP s_lock is held in the caller, the OPFN request must be scheduled + * to a different workqueue to avoid double locking QP s_lock in call to + * ib_post_send in opfn_conn_request + */ +static void opfn_schedule_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + queue_work(opfn_wq, &priv->opfn.opfn_work); +} + +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth) +{ + struct hfi1_qp_priv *priv = qp->priv; + u64 data = be64_to_cpu(ateth->compare_data); + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + capcode = data & 0xf; + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->response) { + e->atomic_data = capcode; + return; + } + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (priv->opfn.completed & OPFN_CODE(capcode)) { + /* + * We are receiving a request for a feature that has already + * been negotiated. This may mean that the other side has reset + */ + priv->opfn.completed &= ~OPFN_CODE(capcode); + if (extd->error) + extd->error(qp); + } + + if (extd->response(qp, &data)) + priv->opfn.completed |= OPFN_CODE(capcode); + e->atomic_data = (data & ~0xf) | capcode; + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + capcode = data & 0xf; + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Either there is no previous request or the reply is not for the + * current request + */ + if (!priv->opfn.curr || capcode != priv->opfn.curr) + goto done; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->reply) + goto clear; + + if (extd->reply(qp, data)) + priv->opfn.completed |= OPFN_CODE(capcode); +clear: + /* + * Clear opfn.curr to indicate that the previous request is no longer in + * progress + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd = NULL; + unsigned long flags; + u16 capcode; + + /* + * The QP has gone into the Error state. We have to invalidate all + * negotiated feature, including the one in progress (if any). 
The RC + * QP handling will clean the WQE for the connection request. + */ + spin_lock_irqsave(&priv->opfn.lock, flags); + while (priv->opfn.completed) { + capcode = priv->opfn.completed & ~(priv->opfn.completed - 1); + extd = &hfi1_opfn_handlers[ilog2(capcode) + 1]; + if (extd->error) + extd->error(qp); + priv->opfn.completed &= ~OPFN_CODE(capcode); + } + priv->opfn.extended = 0; + priv->opfn.requested = 0; + priv->opfn.curr = STL_VERBS_EXTD_NONE; + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct tid_rdma_params *local = &priv->tid_rdma.local; + + if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || + qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { + tid_rdma_opfn_init(qp, local); + /* + * We only want to set the OPFN requested bit when the + * QP transitions to RTS. + */ + if (attr_mask & IB_QP_STATE && + attr->qp_state == IB_QPS_RTS) { + priv->opfn.requested |= OPFN_MASK(TID_RDMA); + /* + * If the QP is transitioning to RTS and the + * opfn.completed for TID RDMA has already been + * set, the QP is being moved *back* into RTS. + * We can now renegotiate the TID RDMA + * parameters. + */ + if (priv->opfn.completed & + OPFN_MASK(TID_RDMA)) { + priv->opfn.completed &= + ~OPFN_MASK(TID_RDMA); + /* + * Since the opfn.completed bit was + * already set, it is safe to assume + * that the opfn.extended is also set. + */ + opfn_schedule_conn_request(qp); + } + } + } else { + memset(local, 0, sizeof(*local)); + } + } + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->opfn.extended && hfi1_opfn_extended(bth1) && + HFI1_CAP_IS_KSET(OPFN)) { + priv->opfn.extended = 1; + if (qp->state == IB_QPS_RTS) + opfn_conn_request(qp); + } +} + +int opfn_init(void) +{ + opfn_wq = alloc_workqueue("hfi_opfn", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | + WQ_MEM_RECLAIM, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); + if (!opfn_wq) + return -ENOMEM; + + return 0; +} + +void opfn_exit(void) +{ + if (opfn_wq) { + destroy_workqueue(opfn_wq); + opfn_wq = NULL; + } +} diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h index 1927c9862b8f..5f2011cabc25 100644 --- a/drivers/infiniband/hw/hfi1/opfn.h +++ b/drivers/infiniband/hw/hfi1/opfn.h @@ -49,10 +49,37 @@ /* STL Verbs Extended */ #define IB_BTHE_E_SHIFT 24 +#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX + +struct ib_atomic_eth; + +enum hfi1_opfn_codes { + STL_VERBS_EXTD_NONE = 0, + STL_VERBS_EXTD_TID_RDMA, + STL_VERBS_EXTD_MAX +}; struct hfi1_opfn_data { + u8 extended; + u16 requested; + u16 completed; + enum hfi1_opfn_codes curr; /* serialize opfn function calls */ spinlock_t lock; + struct work_struct opfn_work; }; +/* WR opcode for OPFN */ +#define IB_WR_OPFN IB_WR_RESERVED3 + +void opfn_send_conn_request(struct work_struct *work); +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth); +void opfn_conn_reply(struct rvt_qp *qp, u64 data); +void opfn_conn_error(struct rvt_qp *qp); +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask); +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1); +int opfn_init(void); +void opfn_exit(void); + #endif /* _HFI1_OPFN_H */ -- cgit 
v1.2.3-59-g8ed1b From ddf922c31fedd19c5b89a269c35e5c8b68c64327 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:01 -0800 Subject: IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue The OPFN protocol uses the COMPARE_SWAP request to exchange data between the requester and the responder and therefore needs to be stored in the QP's s_ack_queue when the request is received on the responder side. However, because the user does not know anything about the OPFN protocol, this extra entry in the queue cannot be advertised to the user. This patch adds an extra entry in a QP's s_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 13 +++++++------ drivers/infiniband/hw/hfi1/verbs.c | 1 + include/rdma/rdma_vt.h | 10 +++++++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 940e9236c328..8970fc7ffd4b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -122,7 +122,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + if (++qp->s_tail_ack_queue > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) qp->s_tail_ack_queue = 0; /* FALLTHROUGH */ case OP(SEND_ONLY): @@ -1818,7 +1819,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, if (i) prev = i - 1; else - prev = HFI1_MAX_RDMA_ATOMIC; + prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); if (prev == qp->r_head_ack_queue) { e = NULL; break; @@ -1942,7 +1943,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) unsigned next; next = n + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); @@ -2298,8 +2299,8 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto nack_inv; next = qp->r_head_ack_queue + 1; - /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ - if (next > HFI1_MAX_RDMA_ATOMIC) + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { @@ -2373,7 +2374,7 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto nack_inv; next = qp->r_head_ack_queue + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c980345cf1e1..ec3899c0874c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index acb3bc96dfa7..168e40be183c 100644 
--- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -182,6 +182,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 extra_rdma_atomic; u8 reserved_operations; }; @@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) */ static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) { - return rdi->dparms.max_rdma_atomic + 1; + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic + 1; +} + +static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic; } /* -- cgit v1.2.3-59-g8ed1b From 48a615dc00aed68d58244b835b10eb3244aae31d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:11 -0800 Subject: IB/hfi1: Integrate OPFN into RC transactions OPFN parameter negotiation allows a pair of connected RC QPs to exchange a set of parameters in succession. This negotiation does not commence till the first ULP request. Because OPFN operations are operations private to the driver, they do not generate user completions or put the QP into error when they run out of retries. This patch integrates the OPFN protocol into the transactions of an RC QP. Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 7 +++++ drivers/infiniband/hw/hfi1/qp.c | 13 ++++++++ drivers/infiniband/hw/hfi1/rc.c | 58 +++++++++++++++++++++++++++++------ drivers/infiniband/hw/hfi1/tid_rdma.c | 11 +++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 1 + drivers/infiniband/hw/hfi1/verbs.c | 1 + 6 files changed, 81 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 09c898d0975c..a8dbd0f191f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1498,6 +1498,12 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + /* * These must be called before the driver is registered with * the PCI subsystem. 
@@ -1528,6 +1534,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + opfn_exit(); node_affinity_destroy_all(); hfi1_dbg_exit(); diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 5344e8993b28..f822f92b415f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -132,6 +132,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .qpt_support = BIT(IB_QPT_RC), }, +[IB_WR_OPFN] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_USE_RESERVE, +}, + }; static void flush_list_head(struct list_head *l) @@ -285,6 +291,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); qp_set_16b(qp); } + + opfn_qp_init(qp, attr, attr_mask); } /** @@ -696,6 +704,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_qp_priv_tid_free(rdi, qp); kfree(priv->s_ahg); kfree(priv); } @@ -751,6 +760,10 @@ void notify_qp_reset(struct rvt_qp *qp) { qp->r_adefered = 0; clear_ahg(qp); + + /* Clear any OPFN state */ + if (qp->ibqp.qp_type == IB_QPT_RC) + opfn_conn_error(qp); } /* diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 8970fc7ffd4b..092d5eba980f 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -57,6 +57,10 @@ /* cut down ridiculously long IB macro names */ #define OP(x) RC_OP(x) +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 psn, u32 pmtu) { @@ -517,10 +521,14 @@ no_flow_control: goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; } - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + + /* FALLTHROUGH */ + case IB_WR_OPFN: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_OPFN) { qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); @@ -1040,6 +1048,7 @@ done: */ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) { + struct hfi1_qp_priv *priv = qp->priv; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct hfi1_ibport *ibp; @@ -1050,8 +1059,26 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + /* + * We need special handling for the OPFN request WQEs as + * they are not allowed to generate real user errors + */ + if (wqe->wr.opcode == IB_WR_OPFN) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + /* + * Call opfn_conn_reply() with capcode and + * remaining data as 0 to close out the + * current request + */ + opfn_conn_reply(qp, priv->opfn.curr); + wqe = do_rc_completion(qp, wqe, ibp); + qp->s_flags &= ~RVT_S_WAIT_ACK; + } else { + rvt_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } return; } else { /* need to handle delayed completion */ return; @@ -1363,6 +1390,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 *vaddr = wqe->sg_list[0].vaddr; *vaddr = val; } + if (wqe->wr.opcode == 
IB_WR_OPFN) + opfn_conn_reply(qp, val); + if (qp->s_num_rd_atomic && (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || @@ -2068,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) return; fecn = process_ecn(qp, packet); + opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); /* * Process responses (ACKs) before anything else. Note that the @@ -2363,15 +2394,18 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { - struct ib_atomic_eth *ateth; + struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; + u64 vaddr = get_ib_ateth_vaddr(ateth); + bool opfn = opcode == OP(COMPARE_SWAP) && + vaddr == HFI1_VERBS_E_ATOMIC_VADDR; struct rvt_ack_entry *e; - u64 vaddr; atomic64_t *maddr; u64 sdata; u32 rkey; u8 next; - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !opfn)) goto nack_inv; next = qp->r_head_ack_queue + 1; if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) @@ -2387,8 +2421,11 @@ send_last: rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } - ateth = &ohdr->u.atomic_eth; - vaddr = get_ib_ateth_vaddr(ateth); + /* Process OPFN special virtual address */ + if (opfn) { + opfn_conn_response(qp, e, ateth); + goto ack; + } if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); @@ -2407,6 +2444,7 @@ send_last: sdata); rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; +ack: e->opcode = opcode; e->sent = 0; e->psn = psn; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a8fd66f31fee..0c9f313d6229 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -246,5 +246,16 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->rcd = qp_to_rcd(rdi, qp); + spin_lock_init(&qpriv->opfn.lock); + INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + return 0; } + +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) + cancel_work_sync(&priv->opfn.opfn_work); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 18c6d4333f1e..ee8151558e3f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -35,5 +35,6 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ec3899c0874c..571bfd549c2a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.reserved_operations = 1; dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ -- cgit v1.2.3-59-g8ed1b From a131d16460971353e7dd6916d9fd34c1c946a782 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 24 Jan 2019 06:10:09 -0800 Subject: IB/hfi1: Add static trace for OPFN This patch adds the static trace to the OPFN code and moves tid related 
static trace code into a new header file. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/opfn.c | 14 ++ drivers/infiniband/hw/hfi1/tid_rdma.c | 3 + drivers/infiniband/hw/hfi1/trace.h | 1 + drivers/infiniband/hw/hfi1/trace_rx.h | 107 +---------- drivers/infiniband/hw/hfi1/trace_tid.h | 332 +++++++++++++++++++++++++++++++++ 5 files changed, 351 insertions(+), 106 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/trace_tid.h (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 2d46c91eb129..2ca070690b2f 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -48,6 +48,7 @@ static void opfn_conn_request(struct rvt_qp *qp) unsigned long flags; int ret = 0; + trace_hfi1_opfn_state_conn_request(qp); spin_lock_irqsave(&priv->opfn.lock, flags); /* * Exit if the extended bit is not set, or if nothing is requested, or @@ -76,6 +77,7 @@ static void opfn_conn_request(struct rvt_qp *qp) goto done; } + trace_hfi1_opfn_data_conn_request(qp, capcode, data); data = (data & ~0xf) | capcode; memset(&wr, 0, sizeof(wr)); @@ -90,8 +92,11 @@ static void opfn_conn_request(struct rvt_qp *qp) ret = ib_post_send(&qp->ibqp, &wr.wr, NULL); if (ret) goto err; + trace_hfi1_opfn_state_conn_request(qp); return; err: + trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ", + (u64)ret); spin_lock_irqsave(&priv->opfn.lock, flags); /* * In case of an unexpected error return from ib_post_send @@ -123,6 +128,7 @@ static void opfn_schedule_conn_request(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + trace_hfi1_opfn_state_sched_conn_request(qp); queue_work(opfn_wq, &priv->opfn.opfn_work); } @@ -135,7 +141,9 @@ void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, u8 capcode; unsigned long flags; + trace_hfi1_opfn_state_conn_response(qp); capcode = data & 0xf; + trace_hfi1_opfn_data_conn_response(qp, capcode, data); if (!capcode || capcode >= STL_VERBS_EXTD_MAX) return; @@ -160,6 +168,7 @@ void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, if (extd->response(qp, &data)) priv->opfn.completed |= OPFN_CODE(capcode); e->atomic_data = (data & ~0xf) | capcode; + trace_hfi1_opfn_state_conn_response(qp); spin_unlock_irqrestore(&priv->opfn.lock, flags); } @@ -170,7 +179,9 @@ void opfn_conn_reply(struct rvt_qp *qp, u64 data) u8 capcode; unsigned long flags; + trace_hfi1_opfn_state_conn_reply(qp); capcode = data & 0xf; + trace_hfi1_opfn_data_conn_reply(qp, capcode, data); if (!capcode || capcode >= STL_VERBS_EXTD_MAX) return; @@ -195,6 +206,7 @@ clear: * progress */ priv->opfn.curr = STL_VERBS_EXTD_NONE; + trace_hfi1_opfn_state_conn_reply(qp); done: spin_unlock_irqrestore(&priv->opfn.lock, flags); } @@ -206,6 +218,8 @@ void opfn_conn_error(struct rvt_qp *qp) unsigned long flags; u16 capcode; + trace_hfi1_opfn_state_conn_error(qp); + trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state); /* * The QP has gone into the Error state. We have to invalidate all * negotiated feature, including the one in progress (if any). 
The RC diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 0c9f313d6229..e8f57c0cd8bc 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -7,6 +7,7 @@ #include "hfi.h" #include "verbs.h" #include "tid_rdma.h" +#include "trace.h" /* * J_KEY for kernel contexts when TID RDMA is used. @@ -148,6 +149,8 @@ bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) priv->tid_timer_timeout_jiffies = usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / 1000UL) << 3) * 7); + trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); + trace_hfi1_opfn_param(qp, 1, remote); rcu_assign_pointer(priv->tid_rdma.remote, remote); /* * A TID RDMA READ request's segment size is not equal to diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 84458f1325e1..1ce551864118 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -63,3 +63,4 @@ __print_symbolic(etype, \ #include "trace_tx.h" #include "trace_mmu.h" #include "trace_iowait.h" +#include "trace_tid.h" diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index 7eceb57e0415..3cec960e9674 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -128,111 +128,6 @@ TRACE_EVENT(hfi1_receive_interrupt, ) ); -DECLARE_EVENT_CLASS( - hfi1_exp_tid_reg_unreg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, - u32 npages, unsigned long va, unsigned long pa, - dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned int, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); - -DEFINE_EVENT( - hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); - -DEFINE_EVENT( - hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); - -TRACE_EVENT( - hfi1_put_tid, - TP_PROTO(struct hfi1_devdata *dd, - u32 index, u32 type, unsigned long pa, u16 order), - TP_ARGS(dd, index, type, pa, order), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(unsigned long, pa); - __field(u32, index); - __field(u32, type); - __field(u16, order); - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->pa = pa; - __entry->index = index; - __entry->type = type; - __entry->order = order; - ), - TP_printk("[%s] type %s pa %lx index %u order %u", - __get_str(dev), - show_tidtype(__entry->type), - __entry->pa, - __entry->index, - __entry->order - ) -); - 
-TRACE_EVENT(hfi1_exp_tid_inval, - TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, - u32 npages, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), - TP_STRUCT__entry( - __field(unsigned int, ctxt) - __field(u16, subctxt) - __field(unsigned long, va) - __field(u32, rarr) - __field(u32, npages) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->va = va; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->va, - __entry->dma - ) - ); - TRACE_EVENT(hfi1_mmu_invalidate, TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type, unsigned long start, unsigned long end), diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h new file mode 100644 index 000000000000..57a973c97cde --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TID_H + +#include +#include + +#include "hfi.h" + +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tid + +#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \ + "max write %u, max length %u, jkey 0x%x timeout %u " \ + "urg %u" + +DECLARE_EVENT_CLASS(/* class */ + hfi1_exp_tid_reg_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) +); + +DEFINE_EVENT(/* exp_tid_unreg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +DEFINE_EVENT(/* exp_tid_reg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +TRACE_EVENT(/* put_tid */ + hfi1_put_tid, + TP_PROTO(struct hfi1_devdata *dd, + u32 index, u32 type, unsigned long pa, u16 order), + TP_ARGS(dd, index, type, pa, order), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd) + __field(unsigned long, pa); + __field(u32, index); + __field(u32, type); + __field(u16, order); + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd); + __entry->pa = pa; + __entry->index = index; + __entry->type = type; + __entry->order = order; + ), + 
TP_printk("[%s] type %s pa %lx index %u order %u", + __get_str(dev), + show_tidtype(__entry->type), + __entry->pa, + __entry->index, + __entry->order + ) +); + +TRACE_EVENT(/* exp_tid_inval */ + hfi1_exp_tid_inval, + TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) +); + +DECLARE_EVENT_CLASS(/* opfn_state */ + hfi1_opfn_state_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u16, requested) + __field(u16, completed) + __field(u8, curr) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->requested = priv->opfn.requested; + __entry->completed = priv->opfn.completed; + __entry->curr = priv->opfn.curr; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x", + __get_str(dev), + __entry->qpn, + __entry->requested, + __entry->completed, + __entry->curr + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_response, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_reply, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_error, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* opfn_data */ + hfi1_opfn_data_template, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, state) + __field(u8, capcode) + __field(u64, data) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->capcode = capcode; + __entry->data = data; + ), + TP_printk(/* printk */ + "[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx", + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->capcode, + __entry->data + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_request, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_response, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_reply, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); 
+ +DECLARE_EVENT_CLASS(/* opfn_param */ + hfi1_opfn_param_template, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, remote) + __field(u32, param_qp) + __field(u32, max_len) + __field(u16, jkey) + __field(u8, max_read) + __field(u8, max_write) + __field(u8, timeout) + __field(u8, urg) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->remote = remote; + __entry->param_qp = param->qp; + __entry->max_len = param->max_len; + __entry->jkey = param->jkey; + __entry->max_read = param->max_read; + __entry->max_write = param->max_write; + __entry->timeout = param->timeout; + __entry->urg = param->urg; + ), + TP_printk(/* print */ + OPFN_PARAM_PRN, + __get_str(dev), + __entry->qpn, + __entry->remote ? "remote" : "local", + __entry->param_qp, + __entry->max_read, + __entry->max_write, + __entry->max_len, + __entry->jkey, + __entry->timeout, + __entry->urg + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_param_template, hfi1_opfn_param, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param) +); + +DECLARE_EVENT_CLASS(/* msg */ + hfi1_msg_template, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more), + TP_STRUCT__entry(/* entry */ + __field(u32, qpn) + __string(msg, msg) + __field(u64, more) + ), + TP_fast_assign(/* assign */ + __entry->qpn = qp ? qp->ibqp.qp_num : 0; + __assign_str(msg, msg); + __entry->more = more; + ), + TP_printk(/* print */ + "qpn 0x%x %s 0x%llx", + __entry->qpn, + __get_str(msg), + __entry->more + ) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_request, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_error, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +#endif /* __HFI1_TRACE_TID_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tid +#include -- cgit v1.2.3-59-g8ed1b From 385156c5f2a61834666f079ee66338f177c65c28 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:29:44 -0800 Subject: IB/hfi: Move RC functions into a header file This patch moves some RC helper functions into a header file so that they can be called from both RC and TID RDMA functions. In addition, a common function for rewinding a request is created in rdmavt so that it can be shared between qib and hfi1 driver. 
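An aside, not part of the patch: the shared rewind is plain arithmetic on packet sequence numbers. On a retry starting at PSN psn, every packet before it has already been transmitted, so restart_sge() computes the byte offset delta_psn(psn, wqe->psn) * pmtu and rvt_restart_sge() resets the SGE state to the start of the WQE, skips that many bytes, and returns what remains. A minimal standalone sketch of that calculation follows (illustrative numbers; the real delta_psn() also handles PSN wrap, which is ignored here).

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t first_psn = 100;    /* wqe->psn: PSN of the WQE's first packet */
	uint32_t retry_psn = 103;    /* PSN the responder asked us to resend from */
	uint32_t pmtu      = 4096;   /* payload bytes carried per packet */
	uint32_t wqe_len   = 32768;  /* wqe->length */

	/* bytes already sent: delta_psn(psn, wqe->psn) * pmtu */
	uint32_t skip = (retry_psn - first_psn) * pmtu;

	/* rvt_restart_sge() skips 'skip' bytes and returns the remainder */
	uint32_t remaining = wqe_len - skip;

	assert(skip == 12288 && remaining == 20480);
	return 0;
}
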
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 119 +++++++++++++++---------------------- drivers/infiniband/hw/hfi1/rc.h | 50 ++++++++++++++++ drivers/infiniband/hw/qib/qib_rc.c | 7 +-- drivers/infiniband/sw/rdmavt/rc.c | 13 ++++ include/rdma/rdmavt_qp.h | 10 ++++ 5 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/rc.h (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 092d5eba980f..6e74cd3814b8 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -51,28 +51,48 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs_txreq.h" #include "trace.h" -/* cut down ridiculously long IB macro names */ -#define OP(x) RC_OP(x) - -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp); - -static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, - u32 psn, u32 pmtu) +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled) + __must_hold(&qp->s_lock) { - u32 len; + struct rvt_ack_entry *e = NULL; + u8 i, p; + bool s = true; - len = delta_psn(psn, wqe->psn) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + for (i = qp->r_head_ack_queue; ; i = p) { + if (i == qp->s_tail_ack_queue) + s = false; + if (i) + p = i - 1; + else + p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + if (p == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[p]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (p == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + s = false; + break; + } + } + if (prev) + *prev = p; + if (prev_ack) + *prev_ack = i; + if (scheduled) + *scheduled = s; + return e; } /** @@ -1229,9 +1249,9 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn) * This is similar to hfi1_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp) +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) { lockdep_assert_held(&qp->s_lock); /* @@ -1314,8 +1334,8 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). 
*/ -static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, - u64 val, struct hfi1_ctxtdata *rcd) +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; @@ -1754,16 +1774,6 @@ bail: return; } -static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, - struct rvt_qp *qp) -{ - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_NAK; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, &rcd->qp_wait_list); - } -} - static inline void rc_cancel_ack(struct rvt_qp *qp) { qp->r_adefered = 0; @@ -1796,8 +1806,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct rvt_ack_entry *e; unsigned long flags; - u8 i, prev; - int old_req; + u8 prev; + u8 mra; /* most recent ACK */ + bool old_req; trace_hfi1_rcv_error(qp, psn); if (diff > 0) { @@ -1843,29 +1854,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, spin_lock_irqsave(&qp->s_lock, flags); - for (i = qp->r_head_ack_queue; ; i = prev) { - if (i == qp->s_tail_ack_queue) - old_req = 0; - if (i) - prev = i - 1; - else - prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); - if (prev == qp->r_head_ack_queue) { - e = NULL; - break; - } - e = &qp->s_ack_queue[prev]; - if (!e->opcode) { - e = NULL; - break; - } - if (cmp_psn(psn, e->psn) >= 0) { - if (prev == qp->s_tail_ack_queue && - cmp_psn(psn, e->lpsn) <= 0) - old_req = 0; - break; - } - } + e = find_prev_entry(qp, psn, &prev, &mra, &old_req); + switch (opcode) { case OP(RDMA_READ_REQUEST): { struct ib_reth *reth; @@ -1940,7 +1930,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the most recent ACK if this request is * after all the previous RDMA reads and atomics. */ - if (i == qp->r_head_ack_queue) { + if (mra == qp->r_head_ack_queue) { spin_unlock_irqrestore(&qp->s_lock, flags); qp->r_nak_state = 0; qp->r_ack_psn = qp->r_psn - 1; @@ -1951,7 +1941,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ - qp->s_tail_ack_queue = i; + qp->s_tail_ack_queue = mra; break; } qp->s_ack_state = OP(ACKNOWLEDGE); @@ -1968,17 +1958,6 @@ send_ack: return 0; } -static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) -{ - unsigned next; - - next = n + 1; - if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - next = 0; - qp->s_tail_ack_queue = next; - qp->s_ack_state = OP(ACKNOWLEDGE); -} - static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type) { diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h new file mode 100644 index 000000000000..4329eadcb3df --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. 
+ * + */ + +#ifndef HFI1_RC_H +#define HFI1_RC_H + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) +{ + unsigned int next; + + next = n + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + return rvt_restart_sge(ss, wqe, len); +} + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled); +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd); +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + +#endif /* HFI1_RC_H */ diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 6fa002940451..50dd9811b088 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len; len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + return rvt_restart_sge(ss, wqe, len); } /** diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 6131cc558bdb..8d71647820a8 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth) } } EXPORT_SYMBOL(rvt_get_credit); + +/* rvt_restart_sge - rewind the sge state for a wqe */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len) +{ + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + rvt_skip_sge(ss, len, false); + return wqe->length - len; +} +EXPORT_SYMBOL(rvt_restart_sge); + diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index cbafb1878669..56a9221378d9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -628,6 +628,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp); */ void rvt_get_credit(struct rvt_qp *qp, u32 aeth); +/** + * rvt_restart_sge - rewind the sge state for a wqe + * @ss: the sge state pointer + * @wqe: the wqe to rewind + * @len: the data length from the start of the wqe in bytes + * + * Returns the remaining data length. + */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); + /** * @qp - the qp pair * @len - the length -- cgit v1.2.3-59-g8ed1b From 37356e78328186814e994e0ad1a1cfd6a142bef4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Tue, 5 Feb 2019 14:13:13 -0800 Subject: IB/hfi1: TID RDMA flow allocation The hfi1 hardware flow is a hardware flow-control mechanism for a KDETH data packet that is received on a hfi1 port. It validates the packet by checking both the generation and sequence. 
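A simplified sketch (not part of the patch) of the check the hardware flow performs: the low HFI1_KDETH_BTH_SEQ_SHIFT (11) bits of the packet's PSN carry an in-order sequence number and the bits above them carry the flow generation, and a KDETH packet is accepted only when both match the flow's current state. The sketch uses the SEQ shift/mask and GENERATION_MASK constants added by this patch; the real validation is done by the receive context's TID flow table in hardware, not by driver code.

#include <stdbool.h>
#include <stdint.h>

#define HFI1_KDETH_BTH_SEQ_SHIFT 11
#define HFI1_KDETH_BTH_SEQ_MASK  ((1u << HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
#define GENERATION_MASK          0xFFFFF

static bool flow_accepts(uint32_t psn, uint32_t expected_gen, uint32_t expected_seq)
{
	uint32_t seq = psn & HFI1_KDETH_BTH_SEQ_MASK;
	uint32_t gen = (psn >> HFI1_KDETH_BTH_SEQ_SHIFT) & GENERATION_MASK;

	/* both the generation and the in-order sequence must match */
	return gen == expected_gen && seq == expected_seq;
}

int main(void)
{
	/* generation 5, sequence 3, composed the way the sender would build it */
	uint32_t psn = (5u << HFI1_KDETH_BTH_SEQ_SHIFT) | 3u;

	return flow_accepts(psn, 5, 3) ? 0 : 1;
}
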
Each QP that uses the TID RDMA mechanism will allocate a hardware flow from its receiving context for any incoming KDETH data packets. This patch implements: (1) a function to allocate hardware flow (2) a function to free hardware flow (3) a function to initialize hardware flow generation for a receiving context (4) a wait mechanism if the hardware flow is not available (5) a function to remove the qp from the wait queue for hardware flow when the qp is reset or destroyed. Signed-off-by: Mitko Haralanov Signed-off-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/common.h | 4 + drivers/infiniband/hw/hfi1/hfi.h | 15 ++ drivers/infiniband/hw/hfi1/init.c | 7 + drivers/infiniband/hw/hfi1/qp.c | 3 + drivers/infiniband/hw/hfi1/qp.h | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 440 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 17 ++ drivers/infiniband/hw/hfi1/verbs.h | 3 + 8 files changed, 491 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 40d3cfb58bd1..7310a5dba420 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -340,6 +340,10 @@ struct diag_pkt { #define HFI1_PSM_IOC_BASE_SEQ 0x0 +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define HFI1_KDETH_BTH_SEQ_SHIFT 11 +#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1) + static inline __u64 rhf_to_cpu(const __le32 *rbuf) { return __le64_to_cpu(*((__le64 *)rbuf)); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9aa0357e17b7..78aa344c7403 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -198,6 +198,14 @@ struct exp_tid_set { }; typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +struct tid_queue { + struct list_head queue_head; + /* queue head for QP TID resource waiters */ + u32 enqueue; /* count of tid enqueues */ + u32 dequeue; /* count of tid dequeues */ +}; + struct hfi1_ctxtdata { /* rcvhdrq base, needs mmap before useful */ void *rcvhdrq; @@ -291,6 +299,10 @@ struct hfi1_ctxtdata { /* PSM Specific fields */ /* lock protecting all Expected TID data */ struct mutex exp_mutex; + /* lock protecting all Expected TID data of kernel contexts */ + spinlock_t exp_lock; + /* Queue for QP's waiting for HW TID flows */ + struct tid_queue flow_queue; /* when waiting for rcv or pioavail */ wait_queue_head_t wait; /* uuid from PSM */ @@ -323,6 +335,9 @@ struct hfi1_ctxtdata { */ u8 subctxt_cnt; + /* Bit mask to track free TID RDMA HW flows */ + unsigned long flow_mask; + struct tid_flow_state flows[RXE_NUM_TID_FLOWS]; }; /** diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index a8dbd0f191f5..56830a514b92 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -370,6 +370,8 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -472,6 +474,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, GFP_KERNEL, numa); if (!rcd->opstats) goto bail; + + /* Initialize TID flow generations for the context */ +
hfi1_kern_init_ctxt_generations(rcd); } *context = rcd; @@ -771,6 +776,8 @@ static void enable_chip(struct hfi1_devdata *dd) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; hfi1_rcvctrl(dd, rcvmask, rcd); sc_enable(rcd->sc); hfi1_rcd_put(rcd); diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index f822f92b415f..69c38af49492 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -738,6 +738,7 @@ void flush_qp_waiters(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); flush_iowait(qp); + hfi1_tid_rdma_flush_wait(qp); } void stop_send_queue(struct rvt_qp *qp) @@ -745,6 +746,8 @@ void stop_send_queue(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; iowait_cancel_work(&priv->s_iowait); + if (cancel_work_sync(&priv->tid_rdma.trigger_work)) + rvt_put_qp(qp); } void quiesce_qp(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 7adb6dff6813..ce25a27aa4a1 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -63,11 +63,13 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_AHG_VALID - ahg header valid on chip * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain + * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 +#define HFI1_S_WAIT_TID_SPACE 0x10000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index e8f57c0cd8bc..70671212808f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5,10 +5,28 @@ */ #include "hfi.h" +#include "qp.h" #include "verbs.h" #include "tid_rdma.h" #include "trace.h" +#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) +#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) +#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) +#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) + +#define GENERATION_MASK 0xFFFFF + +static u32 mask_generation(u32 a) +{ + return a & GENERATION_MASK; +} + +/* Reserved generation value to set to unused flows for kernel contexts */ +#define KERN_GENERATION_RESERVED mask_generation(U32_MAX) + /* * J_KEY for kernel contexts when TID RDMA is used. * See generate_jkey() in hfi.h for more information. 
@@ -60,6 +78,8 @@ * C - Capcode */ +static void tid_rdma_trigger_resume(struct work_struct *work); + static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { return @@ -251,6 +271,12 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, spin_lock_init(&qpriv->opfn.lock); INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); + qpriv->flow_state.psn = 0; + qpriv->flow_state.index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + INIT_LIST_HEAD(&qpriv->tid_wait); return 0; } @@ -262,3 +288,417 @@ void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) cancel_work_sync(&priv->opfn.opfn_work); } + +/* Flow and tid waiter functions */ +/** + * DOC: lock ordering + * + * There are two locks involved with the queuing + * routines: the qp s_lock and the exp_lock. + * + * Since the tid space allocation is called from + * the send engine, the qp s_lock is already held. + * + * The allocation routines will get the exp_lock. + * + * The first_qp() call is provided to allow the head of + * the rcd wait queue to be fetched under the exp_lock and + * followed by a drop of the exp_lock. + * + * Any qp in the wait list will have the qp reference count held + * to hold the qp in memory. + */ + +/* + * return head of rcd wait list + * + * Must hold the exp_lock. + * + * Get a reference to the QP to hold the QP in memory. + * + * The caller must release the reference when the local + * is no longer being used. + */ +static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue) + __must_hold(&rcd->exp_lock) +{ + struct hfi1_qp_priv *priv; + + lockdep_assert_held(&rcd->exp_lock); + priv = list_first_entry_or_null(&queue->queue_head, + struct hfi1_qp_priv, + tid_wait); + if (!priv) + return NULL; + rvt_get_qp(priv->owner); + return priv->owner; +} + +/** + * kernel_tid_waiters - determine rcd wait + * @rcd: the receive context + * @qp: the head of the qp being processed + * + * This routine will return false IFF + * the list is NULL or the head of the + * list is the indicated qp. + * + * Must hold the qp s_lock and the exp_lock. + * + * Return: + * false if either of the conditions below are statisfied: + * 1. The list is empty or + * 2. The indicated qp is at the head of the list and the + * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. + * true is returned otherwise. + */ +static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct rvt_qp *fqp; + bool ret = true; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + fqp = first_qp(rcd, queue); + if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) + ret = false; + rvt_put_qp(fqp); + return ret; +} + +/** + * dequeue_tid_waiter - dequeue the qp from the list + * @qp - the qp to remove the wait list + * + * This routine removes the indicated qp from the + * wait list if it is there. + * + * This should be done after the hardware flow and + * tid array resources have been allocated. + * + * Must hold the qp s_lock and the rcd exp_lock. + * + * It assumes the s_lock to protect the s_flags + * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. 
+ */ +static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) + return; + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); +} + +/** + * queue_qp_for_tid_wait - suspend QP on tid space + * @rcd: the receive context + * @qp: the qp + * + * The qp is inserted at the tail of the rcd + * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. + * + * Must hold the qp s_lock and the exp_lock. + */ +static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) { + qp->s_flags |= HFI1_S_WAIT_TID_SPACE; + list_add_tail(&priv->tid_wait, &queue->queue_head); + priv->tid_enqueue = ++queue->enqueue; + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); + rvt_get_qp(qp); + } +} + +/** + * __trigger_tid_waiter - trigger tid waiter + * @qp: the qp + * + * This is a private entrance to schedule the qp + * assuming the caller is holding the qp->s_lock. + */ +static void __trigger_tid_waiter(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + lockdep_assert_held(&qp->s_lock); + if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) + return; + trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); + hfi1_schedule_send(qp); +} + +/** + * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp + * @qp - the qp + * + * trigger a schedule or a waiting qp in a deadlock + * safe manner. The qp reference is held prior + * to this call via first_qp(). + * + * If the qp trigger was already scheduled (!rval) + * the the reference is dropped, otherwise the resume + * or the destroy cancel will dispatch the reference. + */ +static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + bool rval; + + if (!qp) + return; + + priv = qp->priv; + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + dd = dd_from_ibdev(qp->ibqp.device); + + rval = queue_work_on(priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node)), + ppd->hfi1_wq, + &priv->tid_rdma.trigger_work); + if (!rval) + rvt_put_qp(qp); +} + +/** + * tid_rdma_trigger_resume - field a trigger work request + * @work - the work item + * + * Complete the off qp trigger processing by directly + * calling the progress routine. + */ +static void tid_rdma_trigger_resume(struct work_struct *work) +{ + struct tid_rdma_qp_params *tr; + struct hfi1_qp_priv *priv; + struct rvt_qp *qp; + + tr = container_of(work, struct tid_rdma_qp_params, trigger_work); + priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); + qp = priv->owner; + spin_lock_irq(&qp->s_lock); + if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { + spin_unlock_irq(&qp->s_lock); + hfi1_do_send(priv->owner, true); + } else { + spin_unlock_irq(&qp->s_lock); + } + rvt_put_qp(qp); +} + +/** + * tid_rdma_flush_wait - unwind any tid space wait + * + * This is called when resetting a qp to + * allow a destroy or reset to get rid + * of any tid space linkage and reference counts. 
+ */ +static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv; + + if (!qp) + return; + lockdep_assert_held(&qp->s_lock); + priv = qp->priv; + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + spin_lock(&priv->rcd->exp_lock); + if (!list_empty(&priv->tid_wait)) { + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); + } + spin_unlock(&priv->rcd->exp_lock); +} + +void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); +} + +/* Flow functions */ +/** + * kern_reserve_flow - allocate a hardware flow + * @rcd - the context to use for allocation + * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to + * signify "don't care". + * + * Use a bit mask based allocation to reserve a hardware + * flow for use in receiving KDETH data packets. If a preferred flow is + * specified the function will attempt to reserve that flow again, if + * available. + * + * The exp_lock must be held. + * + * Return: + * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On failure: -EAGAIN + */ +static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) + __must_hold(&rcd->exp_lock) +{ + int nr; + + /* Attempt to reserve the preferred flow index */ + if (last >= 0 && last < RXE_NUM_TID_FLOWS && + !test_and_set_bit(last, &rcd->flow_mask)) + return last; + + nr = ffz(rcd->flow_mask); + BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= + (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); + if (nr > (RXE_NUM_TID_FLOWS - 1)) + return -EAGAIN; + set_bit(nr, &rcd->flow_mask); + return nr; +} + +static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, + u32 flow_idx) +{ + u64 reg; + + reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | + RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | + RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; + + if (generation != KERN_GENERATION_RESERVED) + reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; + + write_uctxt_csr(rcd->dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); +} + +static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + u32 generation = rcd->flows[flow_idx].generation; + + kern_set_hw_flow(rcd, generation, flow_idx); + return generation; +} + +static u32 kern_flow_generation_next(u32 gen) +{ + u32 generation = mask_generation(gen + 1); + + if (generation == KERN_GENERATION_RESERVED) + generation = mask_generation(generation + 1); + return generation; +} + +static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + rcd->flows[flow_idx].generation = + kern_flow_generation_next(rcd->flows[flow_idx].generation); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); +} + +int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + int ret = 0; + + /* The QP already has an allocated flow */ + if (fs->index != RXE_NUM_TID_FLOWS) + return ret; + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) + goto queue; + + ret = 
kern_reserve_flow(rcd, fs->last_index); + if (ret < 0) + goto queue; + fs->index = ret; + fs->last_index = fs->index; + + /* Generation received in a RESYNC overrides default flow generation */ + if (fs->generation != KERN_GENERATION_RESERVED) + rcd->flows[fs->index].generation = fs->generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + fs->psn = 0; + fs->flags = 0; + dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + tid_rdma_schedule_tid_wakeup(fqp); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + + if (fs->index >= RXE_NUM_TID_FLOWS) + return; + spin_lock_irqsave(&rcd->exp_lock, flags); + kern_clear_hw_flow(rcd, fs->index); + clear_bit(fs->index, &rcd->flow_mask); + fs->index = RXE_NUM_TID_FLOWS; + fs->psn = 0; + fs->generation = KERN_GENERATION_RESERVED; + + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + if (fqp == qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } +} + +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) +{ + int i; + + for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { + rcd->flows[i].generation = mask_generation(prandom_u32()); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); + } +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index ee8151558e3f..3bc0aaf9568f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -21,10 +21,21 @@ struct tid_rdma_params { }; struct tid_rdma_qp_params { + struct work_struct trigger_work; struct tid_rdma_params local; struct tid_rdma_params __rcu *remote; }; +/* Track state for each hardware flow */ +struct tid_flow_state { + u32 generation; + u32 psn; + u32 r_next_psn; /* next PSN to be received (in TID space) */ + u8 index; + u8 last_index; + u8 flags; +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -37,4 +48,10 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp); + +int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); +void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index c8baa1e38ff6..9065e470bebb 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -159,9 +159,12 @@ struct hfi1_qp_priv { struct sdma_engine *s_sde; /* current sde */ struct send_context *s_sendcontext; /* current sendcontext */ struct hfi1_ctxtdata *rcd; /* QP's receive context */ + u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct 
list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; + struct tid_flow_state flow_state; struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ -- cgit v1.2.3-59-g8ed1b From 838b6fd2d9ca29998869e4d1ecf4566efe807666 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:07 -0800 Subject: IB/hfi1: TID RDMA RcvArray programming and TID allocation TID entries are used by hfi1 hardware to receive data payload from incoming packets directly into a user buffer and thus avoid data copying by software. This patch implements the functions for TID allocation, freeing, and programming TID RcvArray entries in hardware for kernel clients. TID entries are managed via lists of TID groups similar to PSM. Furthermore, to track TID resource allocation for each request, software flows are also allocated and freed as needed. Since software flows consume large amount of memory for tracking TID allocation and freeing, it is generally desirable to allocate them dynamically in the send queue and only for TID RDMA requests, but pre-allocate them for receive queue because the send queue could have thousands of entries while the receive queue has only a limited number of entries. Signed-off-by: Mitko Haralanov Signed-off-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 2 + drivers/infiniband/hw/hfi1/init.c | 3 +- drivers/infiniband/hw/hfi1/tid_rdma.c | 877 +++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/tid_rdma.h | 101 ++++ drivers/infiniband/hw/hfi1/user_exp_rcv.h | 1 - drivers/infiniband/hw/hfi1/verbs.c | 29 +- drivers/infiniband/hw/hfi1/verbs.h | 34 ++ drivers/infiniband/sw/rdmavt/qp.c | 2 +- include/rdma/rdmavt_qp.h | 2 + 9 files changed, 1033 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 78aa344c7403..1412ed157c98 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -303,6 +303,8 @@ struct hfi1_ctxtdata { spinlock_t exp_lock; /* Queue for QP's waiting for HW TID flows */ struct tid_queue flow_queue; + /* Queue for QP's waiting for HW receive array entries */ + struct tid_queue rarr_queue; /* when waiting for rcv or pioavail */ wait_queue_head_t wait; /* uuid from PSM */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 56830a514b92..d13304f7340d 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -372,6 +372,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, mutex_init(&rcd->exp_mutex); spin_lock_init(&rcd->exp_lock); INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -1596,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; if (rcd) { - hfi1_clear_tids(rcd); + hfi1_free_ctxt_rcv_groups(rcd); hfi1_free_ctxt(rcd); } } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 70671212808f..1d02b12590f6 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -8,6 +8,7 @@ #include "qp.h" #include "verbs.h" #include "tid_rdma.h" +#include "exp_rcv.h" #include "trace.h" #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) @@ 
-35,8 +36,14 @@ static u32 mask_generation(u32 a) #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) +/* Maximum number of segments in flight per QP request. */ #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 +#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ + TID_RDMA_MAX_WRITE_SEGS_PER_REQ) +#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) + +#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 @@ -79,6 +86,11 @@ static u32 mask_generation(u32 a) */ static void tid_rdma_trigger_resume(struct work_struct *work); +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp); +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -230,7 +242,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); rcd->jkey = TID_RDMA_JKEY; hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); - return 0; + return hfi1_alloc_ctxt_rcv_groups(rcd); } /** @@ -266,6 +278,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; + int i, ret; qpriv->rcd = qp_to_rcd(rdi, qp); @@ -278,15 +291,75 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.generation = KERN_GENERATION_RESERVED; INIT_LIST_HEAD(&qpriv->tid_wait); + if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct hfi1_devdata *dd = qpriv->rcd->dd; + + qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * + sizeof(*qpriv->pages), + GFP_KERNEL, dd->node); + if (!qpriv->pages) + return -ENOMEM; + for (i = 0; i < qp->s_size; i++) { + struct hfi1_swqe_priv *priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.swqe = wqe; + wqe->priv = priv; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.ack = &qp->s_ack_queue[i]; + + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, + GFP_KERNEL); + if (ret) { + kfree(priv); + return ret; + } + qp->s_ack_queue[i].priv = priv; + } + } + return 0; } void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { - struct hfi1_qp_priv *priv = qp->priv; - - if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) - cancel_work_sync(&priv->opfn.opfn_work); + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + u32 i; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + for (i = 0; i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->priv); + wqe->priv = NULL; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); + kfree(priv); + qp->s_ack_queue[i].priv = NULL; + } + cancel_work_sync(&qpriv->opfn.opfn_work); + kfree(qpriv->pages); + qpriv->pages = NULL; + } } /* Flow and tid waiter functions */ @@ -540,6 +613,7 @@ void 
hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); + _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); } /* Flow functions */ @@ -702,3 +776,796 @@ void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); } } + +/* TID allocation functions */ +static u8 trdma_pset_order(struct tid_rdma_pageset *s) +{ + u8 count = s->count; + + return ilog2(count) + 1; +} + +/** + * tid_rdma_find_phys_blocks_4k - get groups base on mr info + * @npages - number of pages + * @pages - pointer to an array of page structs + * @list - page set array to return + * + * This routine returns the number of groups associated with + * the current sge information. This implementation is based + * on the expected receive find_phys_blocks() adjusted to + * use the MR information vs. the pfn. + * + * Return: + * the number of RcvArray entries + */ +static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 pagecount, pageidx, setcount = 0, i; + void *vaddr, *this_vaddr; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + vaddr = page_address(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_vaddr = i < npages ? page_address(pages[i]) : NULL; + /* + * If the vaddr's are not sequential, pages are not physically + * contiguous. + */ + if (this_vaddr != (vaddr + PAGE_SIZE)) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + vaddr = this_vaddr; + } else { + vaddr += PAGE_SIZE; + pagecount++; + } + } + /* insure we always return an even number of sets */ + if (setcount & 1) + list[setcount++].count = 0; + return setcount; +} + +/** + * tid_flush_pages - dump out pages into pagesets + * @list - list of pagesets + * @idx - pointer to current page index + * @pages - number of pages to dump + * @sets - current number of pagesset + * + * This routine flushes out accumuated pages. + * + * To insure an even number of sets the + * code may add a filler. + * + * This can happen with when pages is not + * a power of 2 or pages is a power of 2 + * less than the maximum pages. 
+ * + * Return: + * The new number of sets + */ + +static u32 tid_flush_pages(struct tid_rdma_pageset *list, + u32 *idx, u32 pages, u32 sets) +{ + while (pages) { + u32 maxpages = pages; + + if (maxpages > MAX_EXPECTED_PAGES) + maxpages = MAX_EXPECTED_PAGES; + else if (!is_power_of_2(maxpages)) + maxpages = rounddown_pow_of_two(maxpages); + list[sets].idx = *idx; + list[sets++].count = maxpages; + *idx += maxpages; + pages -= maxpages; + } + /* might need a filler */ + if (sets & 1) + list[sets++].count = 0; + return sets; +} + +/** + * tid_rdma_find_phys_blocks_8k - get groups base on mr info + * @pages - pointer to an array of page structs + * @npages - number of pages + * @list - page set array to return + * + * This routine parses an array of pages to compute pagesets + * in an 8k compatible way. + * + * pages are tested two at a time, i, i + 1 for contiguous + * pages and i - 1 and i contiguous pages. + * + * If any condition is false, any accumlated pages are flushed and + * v0,v1 are emitted as separate PAGE_SIZE pagesets + * + * Otherwise, the current 8k is totaled for a future flush. + * + * Return: + * The number of pagesets + * list set with the returned number of pagesets + * + */ +static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 idx, sets = 0, i; + u32 pagecnt = 0; + void *v0, *v1, *vm1; + + if (!npages) + return 0; + for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { + /* get a new v0 */ + v0 = page_address(pages[i]); + v1 = i + 1 < npages ? + page_address(pages[i + 1]) : NULL; + /* compare i, i + 1 vaddr */ + if (v1 != (v0 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + /* output v0,v1 as two pagesets */ + list[sets].idx = idx++; + list[sets++].count = 1; + if (v1) { + list[sets].count = 1; + list[sets++].idx = idx++; + } else { + list[sets++].count = 0; + } + vm1 = NULL; + pagecnt = 0; + continue; + } + /* i,i+1 consecutive, look at i-1,i */ + if (vm1 && v0 != (vm1 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + pagecnt = 0; + } + /* pages will always be a multiple of 8k */ + pagecnt += 2; + /* save i-1 */ + vm1 = v1; + /* move to next pair */ + } + /* dump residual pages at end */ + sets = tid_flush_pages(list, &idx, npages - idx, sets); + /* by design cannot be odd sets */ + WARN_ON(sets & 1); + return sets; +} + +/** + * Find pages for one segment of a sge array represented by @ss. The function + * does not check the sge, the sge must have been checked for alignment with a + * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of + * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge + * copy maintained in @ss->sge, the original sge is not modified. + * + * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not + * releasing the MR reference count at the same time. Otherwise, we'll "leak" + * references to the MR. This difference requires that we keep track of progress + * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request + * structure. 
+ */ +static u32 kern_find_pages(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + struct tid_rdma_request *req = flow->req; + struct rvt_sge *sge = &ss->sge; + u32 length = flow->req->seg_len; + u32 len = PAGE_SIZE; + u32 i = 0; + + while (length && req->isge < ss->num_sge) { + pages[i++] = virt_to_page(sge->vaddr); + + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (!sge->sge_length) { + if (++req->isge < ss->num_sge) + *sge = ss->sg_list[req->isge - 1]; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + ++sge->m; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + + flow->length = flow->req->seg_len - length; + *last = req->isge == ss->num_sge ? false : true; + return i; +} + +static void dma_unmap_flow(struct tid_rdma_flow *flow) +{ + struct hfi1_devdata *dd; + int i; + struct tid_rdma_pageset *pset; + + dd = flow->req->rcd->dd; + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count && pset->addr) { + dma_unmap_page(&dd->pcidev->dev, + pset->addr, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + pset->mapped = 0; + } + } +} + +static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) +{ + int i; + struct hfi1_devdata *dd = flow->req->rcd->dd; + struct tid_rdma_pageset *pset; + + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count) { + pset->addr = dma_map_page(&dd->pcidev->dev, + pages[pset->idx], + 0, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + + if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { + dma_unmap_flow(flow); + return -ENOMEM; + } + pset->mapped = 1; + } + } + return 0; +} + +static inline bool dma_mapped(struct tid_rdma_flow *flow) +{ + return !!flow->pagesets[0].mapped; +} + +/* + * Get pages pointers and identify contiguous physical memory chunks for a + * segment. All segments are of length flow->req->seg_len. + */ +static int kern_get_phys_blocks(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + u8 npages; + + /* Reuse previously computed pagesets, if any */ + if (flow->npagesets) { + if (!dma_mapped(flow)) + return dma_map_flow(flow, pages); + return 0; + } + + npages = kern_find_pages(flow, pages, ss, last); + + if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) + flow->npagesets = + tid_rdma_find_phys_blocks_4k(flow, pages, npages, + flow->pagesets); + else + flow->npagesets = + tid_rdma_find_phys_blocks_8k(flow, pages, npages, + flow->pagesets); + + return dma_map_flow(flow, pages); +} + +static inline void kern_add_tid_node(struct tid_rdma_flow *flow, + struct hfi1_ctxtdata *rcd, char *s, + struct tid_group *grp, u8 cnt) +{ + struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; + + WARN_ON_ONCE(flow->tnode_cnt >= + (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); + if (WARN_ON_ONCE(cnt & 1)) + dd_dev_err(rcd->dd, + "unexpected odd allocation cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + + node->grp = grp; + node->map = grp->map; + node->cnt = cnt; +} + +/* + * Try to allocate pageset_count TID's from TID groups for a context + * + * This function allocates TID's without moving groups between lists or + * modifying grp->map. This is done as follows, being cogizant of the lists + * between which the TID groups will move: + * 1. 
First allocate complete groups of 8 TID's since this is more efficient, + * these groups will move from group->full without affecting used + * 2. If more TID's are needed allocate from used (will move from used->full or + * stay in used) + * 3. If we still don't have the required number of TID's go back and look again + * at a complete group (will move from group->used) + */ +static int kern_alloc_tids(struct tid_rdma_flow *flow) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + u32 ngroups, pageidx = 0; + struct tid_group *group = NULL, *used; + u8 use; + + flow->tnode_cnt = 0; + ngroups = flow->npagesets / dd->rcv_entries.group_size; + if (!ngroups) + goto used_list; + + /* First look at complete groups */ + list_for_each_entry(group, &rcd->tid_group_list.list, list) { + kern_add_tid_node(flow, rcd, "complete groups", group, + group->size); + + pageidx += group->size; + if (!--ngroups) + break; + } + + if (pageidx >= flow->npagesets) + goto ok; + +used_list: + /* Now look at partially used groups */ + list_for_each_entry(used, &rcd->tid_used_list.list, list) { + use = min_t(u32, flow->npagesets - pageidx, + used->size - used->used); + kern_add_tid_node(flow, rcd, "used groups", used, use); + + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; + } + + /* + * Look again at a complete group, continuing from where we left. + * However, if we are at the head, we have reached the end of the + * complete groups list from the first loop above + */ + if (group && &group->list == &rcd->tid_group_list.list) + goto bail_eagain; + group = list_prepare_entry(group, &rcd->tid_group_list.list, + list); + if (list_is_last(&group->list, &rcd->tid_group_list.list)) + goto bail_eagain; + group = list_next_entry(group, list); + use = min_t(u32, flow->npagesets - pageidx, group->size); + kern_add_tid_node(flow, rcd, "complete continue", group, use); + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; +bail_eagain: + return -EAGAIN; +ok: + return 0; +} + +static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, + u32 *pset_idx) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + struct tid_rdma_pageset *pset; + u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; + u32 rcventry, npages = 0, pair = 0, tidctrl; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + pset = &flow->pagesets[(*pset_idx)++]; + if (pset->count) { + hfi1_put_tid(dd, rcventry, PT_EXPECTED, + pset->addr, trdma_pset_order(pset)); + } else { + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + } + npages += pset->count; + + rcventry -= rcd->expected_base; + tidctrl = pair ? 0x3 : rcventry & 0x1 ? 
0x2 : 0x1; + /* + * A single TID entry will be used to use a rcvarr pair (with + * tidctrl 0x3), if ALL these are true (a) the bit pos is even + * (b) the group map shows current and the next bits as free + * indicating two consecutive rcvarry entries are available (c) + * we actually need 2 more entries + */ + pair = !(i & 0x1) && !((node->map >> i) & 0x3) && + node->cnt >= cnt + 2; + if (!pair) { + if (!pset->count) + tidctrl = 0x1; + flow->tid_entry[flow->tidcnt++] = + EXP_TID_SET(IDX, rcventry >> 1) | + EXP_TID_SET(CTRL, tidctrl) | + EXP_TID_SET(LEN, npages); + /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ + flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); + npages = 0; + } + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_full_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_group_list, + &rcd->tid_used_list); + + grp->used++; + grp->map |= BIT(i); + cnt++; + } +} + +static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + u32 rcventry; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + + grp->used--; + grp->map &= ~BIT(i); + cnt++; + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_full_list, + &rcd->tid_used_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_group_list); + } + if (WARN_ON_ONCE(cnt & 1)) { + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + + dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + } +} + +static void kern_program_rcvarray(struct tid_rdma_flow *flow) +{ + u32 pset_idx = 0; + int i; + + flow->npkts = 0; + flow->tidcnt = 0; + for (i = 0; i < flow->tnode_cnt; i++) + kern_program_rcv_group(flow, i, &pset_idx); +} + +/** + * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a + * TID RDMA request + * + * @req: TID RDMA request for which the segment/flow is being set up + * @ss: sge state, maintains state across successive segments of a sge + * @last: set to true after the last sge segment has been processed + * + * This function + * (1) finds a free flow entry in the flow circular buffer + * (2) finds pages and continuous physical chunks constituing one segment + * of an sge + * (3) allocates TID group entries for those chunks + * (4) programs rcvarray entries in the hardware corresponding to those + * TID's + * (5) computes a tidarray with formatted TID entries which can be sent + * to the sender + * (6) Reserves and programs HW flows. + * (7) It also manages queing the QP when TID/flow resources are not + * available. + * + * @req points to struct tid_rdma_request of which the segments are a part. The + * function uses qp, rcd and seg_len members of @req. In the absence of errors, + * req->flow_idx is the index of the flow which has been prepared in this + * invocation of function call. With flow = &req->flows[req->flow_idx], + * flow->tid_entry contains the TID array which the sender can use for TID RDMA + * sends and flow->npkts contains number of packets required to send the + * segment. 
+ * + * hfi1_check_sge_align should be called prior to calling this function and if + * it signals error TID RDMA cannot be used for this sge and this function + * should not be called. + * + * For the queuing, caller must hold the flow->req->qp s_lock from the send + * engine and the function will procure the exp_lock. + * + * Return: + * The function returns -EAGAIN if sufficient number of TID/flow resources to + * map the segment could not be allocated. In this case the function should be + * called again with previous arguments to retry the TID allocation. There are + * no other error returns. The function returns 0 on success. + */ +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->setup_head]; + struct hfi1_ctxtdata *rcd = req->rcd; + struct hfi1_qp_priv *qpriv = req->qp->priv; + unsigned long flags; + struct rvt_qp *fqp; + u16 clear_tail = req->clear_tail; + + lockdep_assert_held(&req->qp->s_lock); + /* + * We return error if either (a) we don't have space in the flow + * circular buffer, or (b) we already have max entries in the buffer. + * Max entries depend on the type of request we are processing and the + * negotiated TID RDMA parameters. + */ + if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || + CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= + req->n_flows) + return -EINVAL; + + /* + * Get pages, identify contiguous physical memory chunks for the segment + * If we can not determine a DMA address mapping we will treat it just + * like if we ran out of space above. + */ + if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { + hfi1_wait_kmem(flow->req->qp); + return -ENOMEM; + } + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) + goto queue; + + /* + * At this point we know the number of pagesets and hence the number of + * TID's to map the segment. Allocate the TID's from the TID groups. If + * we cannot allocate the required number we exit and try again later + */ + if (kern_alloc_tids(flow)) + goto queue; + /* + * Finally program the TID entries with the pagesets, compute the + * tidarray and enable the HW flow + */ + kern_program_rcvarray(flow); + + /* + * Setup the flow state with relevant information. + * This information is used for tracking the sequence of data packets + * for the segment. + * The flow is setup here as this is the most accurate time and place + * to do so. Doing at a later time runs the risk of the flow data in + * qpriv getting out of sync. 
+ */ + memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); + flow->idx = qpriv->flow_state.index; + flow->flow_state.generation = qpriv->flow_state.generation; + flow->flow_state.spsn = qpriv->flow_state.psn; + flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, flow->flow_state.spsn); + qpriv->flow_state.psn += flow->npkts; + + dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + tid_rdma_schedule_tid_wakeup(fqp); + + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) +{ + flow->npagesets = 0; +} + +/* + * This function is called after one segment has been successfully sent to + * release the flow and TID HW/SW resources for that segment. The segments for a + * TID RDMA request are setup and cleared in FIFO order which is managed using a + * circular buffer. + */ +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct hfi1_ctxtdata *rcd = req->rcd; + unsigned long flags; + int i; + struct rvt_qp *fqp; + + lockdep_assert_held(&req->qp->s_lock); + /* Exit if we have nothing in the flow circular buffer */ + if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) + return -EINVAL; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + for (i = 0; i < flow->tnode_cnt; i++) + kern_unprogram_rcv_group(flow, i); + /* To prevent double unprogramming */ + flow->tnode_cnt = 0; + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + dma_unmap_flow(flow); + + hfi1_tid_rdma_reset_flow(flow); + req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); + + if (fqp == req->qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } + + return 0; +} + +/* + * This function is called to release all the tid entries for + * a request. + */ +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + /* Use memory barrier for proper ordering */ + while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { + if (hfi1_kern_exp_rcv_clear(req)) + break; + } +} + +/** + * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information + * @req - the tid rdma request to be cleaned + */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + kfree(req->flows); + req->flows = NULL; +} + +/** + * __trdma_clean_swqe - clean up for large sized QPs + * @qp: the queue patch + * @wqe: the send wqe + */ +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_swqe_priv *p = wqe->priv; + + hfi1_kern_exp_rcv_free_flows(&p->tid_req); +} + +/* + * This can be called at QP create time or in the data path. 
+ */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp) +{ + struct tid_rdma_flow *flows; + int i; + + if (likely(req->flows)) + return 0; + flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, + req->rcd->numa_id); + if (!flows) + return -ENOMEM; + /* mini init */ + for (i = 0; i < MAX_FLOWS; i++) { + flows[i].req = req; + flows[i].npagesets = 0; + flows[i].pagesets[0].mapped = 0; + } + req->flows = flows; + return 0; +} + +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + /* + * Initialize various TID RDMA request variables. + * These variables are "static", which is why they + * can be pre-initialized here before the WRs has + * even been submitted. + * However, non-NULL values for these variables do not + * imply that this WQE has been enabled for TID RDMA. + * Drivers should check the WQE's opcode to determine + * if a request is a TID RDMA one or not. + */ + req->qp = qp; + req->rcd = qpriv->rcd; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3bc0aaf9568f..524baf8c8fac 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -6,7 +6,16 @@ #ifndef HFI1_TID_RDMA_H #define HFI1_TID_RDMA_H +#include +#include "common.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ +#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) struct tid_rdma_params { struct rcu_head rcu_head; @@ -36,6 +45,81 @@ struct tid_flow_state { u8 flags; }; +struct tid_rdma_request { + struct rvt_qp *qp; + struct hfi1_ctxtdata *rcd; + union { + struct rvt_swqe *swqe; + struct rvt_ack_entry *ack; + } e; + + struct tid_rdma_flow *flows; /* array of tid flows */ + u16 n_flows; /* size of the flow buffer window */ + u16 setup_head; /* flow index we are setting up */ + u16 clear_tail; /* flow index we are clearing */ + u16 flow_idx; /* flow index most recently set up */ + + u32 seg_len; + + u32 isge; /* index of "current" sge */ +}; + +/* + * When header suppression is used, PSNs associated with a "flow" are + * relevant (and not the PSNs maintained by verbs). Track per-flow + * PSNs here for a TID RDMA segment. + * + */ +struct flow_state { + u32 flags; + u32 resp_ib_psn; /* The IB PSN of the response for this flow */ + u32 generation; /* generation of flow */ + u32 spsn; /* starting PSN in TID space */ + u32 lpsn; /* last PSN in TID space */ + u32 r_next_psn; /* next PSN to be received (in TID space) */ +}; + +struct tid_rdma_pageset { + dma_addr_t addr : 48; /* Only needed for the first page */ + u8 idx: 8; + u8 count : 7; + u8 mapped: 1; +}; + +/** + * kern_tid_node - used for managing TID's in TID groups + * + * @grp_idx: rcd relative index to tid_group + * @map: grp->map captured prior to programming this TID group in HW + * @cnt: Only @cnt of available group entries are actually programmed + */ +struct kern_tid_node { + struct tid_group *grp; + u8 map; + u8 cnt; +}; + +/* Overall info for a TID RDMA segment */ +struct tid_rdma_flow { + /* + * While a TID RDMA segment is being transferred, it uses a QP number + * from the "KDETH section of QP numbers" (which is different from the + * QP number that originated the request). 
Bits 11-15 of these QP + * numbers identify the "TID flow" for the segment. + */ + struct flow_state flow_state; + struct tid_rdma_request *req; + u32 length; + u8 tnode_cnt; + u8 tidcnt; + u8 idx; + u8 npagesets; + u8 npkts; + struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; + struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; + u32 tid_entry[TID_RDMA_MAX_PAGES]; +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -43,6 +127,23 @@ void tid_rdma_conn_error(struct rvt_qp *qp); void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last); +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req); +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req); +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +/** + * trdma_clean_swqe - clean flows for swqe if large send queue + * @qp: the qp + * @wqe: the send wqe + */ +static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + if (!wqe->priv) + return; + __trdma_clean_swqe(qp, wqe); +} int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index e383cc01a2bf..43b105de1d54 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -48,7 +48,6 @@ */ #include "hfi.h" - #include "exp_rcv.h" struct tid_pageset { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 571bfd549c2a..02c1873a976c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -504,11 +504,28 @@ static void verbs_sdma_complete( hfi1_put_txreq(tx); } +void hfi1_wait_kmem(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct ib_device *ibdev = ibqp->device; + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } +} + static int wait_kmem(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { - struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; @@ -517,15 +534,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, &ps->wait->tx_head); - if (list_empty(&priv->s_iowait.list)) { - if (list_empty(&dev->memwait)) - mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= RVT_S_WAIT_KMEM; - list_add_tail(&priv->s_iowait.list, &dev->memwait); - priv->s_iowait.lock = &dev->iowait_lock; - trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); - rvt_get_qp(qp); - } + hfi1_wait_kmem(qp); write_sequnlock(&dev->iowait_lock); hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 9065e470bebb..94f198b47239 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -159,6 +159,7 @@ struct hfi1_qp_priv { struct sdma_engine *s_sde; /* current sde 
*/ struct send_context *s_sendcontext; /* current sendcontext */ struct hfi1_ctxtdata *rcd; /* QP's receive context */ + struct page **pages; /* for TID page scan */ u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; @@ -173,6 +174,14 @@ struct hfi1_qp_priv { u8 timeout_shift; /* account for number of packets per segment */ }; +struct hfi1_swqe_priv { + struct tid_rdma_request tid_req; +}; + +struct hfi1_ack_priv { + struct tid_rdma_request tid_req; +}; + /* * This structure is used to hold commonly lookedup and computed values during * the send engine progress. @@ -321,6 +330,21 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +/* + * Look through all the active flows for a TID RDMA request and find + * the one (if it exists) that contains the specified PSN. + */ +static inline u32 __full_flow_psn(struct flow_state *state, u32 psn) +{ + return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + (psn & HFI1_KDETH_BTH_SEQ_MASK)); +} + +static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn) +{ + return __full_flow_psn(&flow->flow_state, psn); +} + struct verbs_txreq; void hfi1_put_txreq(struct verbs_txreq *tx); @@ -403,6 +427,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); } +void hfi1_wait_kmem(struct rvt_qp *qp); + +static inline void hfi1_trdma_send_complete(struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + trdma_clean_swqe(qp, wqe); + rvt_send_complete(qp, wqe, status); +} + extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; extern const u8 hdr_len_by_opcode[]; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 16247d2a671d..c8e70cf69a8a 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp) kref_put(&qp->ip->ref, rvt_release_mmap_info); else vfree(qp->r_rq.wq); - vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + vfree(qp->s_wq); kfree(qp); return 0; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56a9221378d9..9095a0b71250 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -174,6 +174,7 @@ struct rvt_swqe { u32 lpsn; /* last packet sequence number */ u32 ssn; /* send sequence number */ u32 length; /* total length of data in sg_list */ + void *priv; /* driver dependent field */ struct rvt_sge sg_list[0]; }; @@ -235,6 +236,7 @@ struct rvt_ack_entry { u32 lpsn; u8 opcode; u8 sent; + void *priv; }; #define RC_QP_SCALING_INTERVAL 5 -- cgit v1.2.3-59-g8ed1b From 2f16a696a05d34ba8c920b2133a51f18107fdb8b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:18 -0800 Subject: IB/hfi1: Add the counter n_tidwait This patch adds the counter n_tidwait to count the number of times the TID resource allocator has to wait for TID resources. 
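As a minimal sketch (names taken from the diff below, nothing new): n_tidwait is a plain per-device u64 that queue_qp_for_tid_wait() increments, under the receive context's exp_lock, each time a QP is put to sleep waiting for TID/flow resources, and it is exported as the "TidWait" device counter through the usual cntr_entry accessor:

/* Mirrors the accessor added in the diff below; the counter itself is
 * bumped in queue_qp_for_tid_wait() when a QP is queued to wait. */
u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
			    void *context, int vl, int mode, u64 data)
{
	struct hfi1_devdata *dd = context;

	return dd->verbs_dev.n_tidwait;
}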
Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 2 ++ drivers/infiniband/hw/hfi1/chip.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 9 +++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++++ drivers/infiniband/hw/hfi1/verbs.h | 1 + 5 files changed, 17 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 4d40311f082e..612f04190ed8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { access_sw_pio_drain), [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, access_sw_kmem_wait), +[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL, + hfi1_access_sw_tid_wait), [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, access_sw_send_schedule), [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index ba3d99e6e33b..6c27c1c6a868 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -927,6 +927,7 @@ enum { C_SW_PIO_WAIT, C_SW_PIO_DRAIN, C_SW_KMEM_WAIT, + C_SW_TID_WAIT, C_SW_SEND_SCHED, C_SDMA_DESC_FETCHED_CNT, C_SDMA_INT_CNT, diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 1d02b12590f6..43c595f30b3e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -495,6 +495,7 @@ static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, qp->s_flags |= HFI1_S_WAIT_TID_SPACE; list_add_tail(&priv->tid_wait, &queue->queue_head); priv->tid_enqueue = ++queue->enqueue; + rcd->dd->verbs_dev.n_tidwait++; trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); rvt_get_qp(qp); } @@ -1569,3 +1570,11 @@ static void hfi1_init_trdma_req(struct rvt_qp *qp, req->qp = qp; req->rcd = qpriv->rcd; } + +u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_tidwait; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 524baf8c8fac..3dbeaa8cb5b3 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -155,4 +155,8 @@ int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd); +struct cntr_entry; +u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 94f198b47239..20729454f181 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -243,6 +243,7 @@ struct hfi1_ibdev { struct kmem_cache *verbs_txreq_cache; u64 n_txwait; u64 n_kmem_wait; + u64 n_tidwait; /* protect iowait lists */ seqlock_t iowait_lock ____cacheline_aligned_in_smp; -- cgit v1.2.3-59-g8ed1b From 84f4a40d46d83003bc762df0d3dd051087cc30ea Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:28 -0800 Subject: IB/hfi1: Add static trace for flow and TID management functions This patch adds the static trace for the flow and TID management 
functions to help debugging in the filed. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 19 +++ drivers/infiniband/hw/hfi1/trace.c | 16 +++ drivers/infiniband/hw/hfi1/trace_tid.h | 234 +++++++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 43c595f30b3e..506b5a59ded5 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -817,8 +817,11 @@ static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, * using the bigger supported sizes. */ vaddr = page_address(pages[0]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr); for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { this_vaddr = i < npages ? page_address(pages[i]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0, + this_vaddr); /* * If the vaddr's are not sequential, pages are not physically * contiguous. @@ -851,6 +854,9 @@ static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, list[setcount].idx = pageidx; list[setcount].count = maxpages; + trace_hfi1_tid_pageset(flow->req->qp, setcount, + list[setcount].idx, + list[setcount].count); pagecount -= maxpages; pageidx += maxpages; setcount++; @@ -946,8 +952,10 @@ static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { /* get a new v0 */ v0 = page_address(pages[i]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); v1 = i + 1 < npages ? page_address(pages[i + 1]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); /* compare i, i + 1 vaddr */ if (v1 != (v0 + PAGE_SIZE)) { /* flush out pages */ @@ -1093,6 +1101,8 @@ static int kern_get_phys_blocks(struct tid_rdma_flow *flow, /* Reuse previously computed pagesets, if any */ if (flow->npagesets) { + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, + flow); if (!dma_mapped(flow)) return dma_map_flow(flow, pages); return 0; @@ -1128,6 +1138,8 @@ static inline void kern_add_tid_node(struct tid_rdma_flow *flow, node->grp = grp; node->map = grp->map; node->cnt = cnt; + trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, + grp->base, grp->map, grp->used, cnt); } /* @@ -1199,6 +1211,8 @@ used_list: if (pageidx >= flow->npagesets) goto ok; bail_eagain: + trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", + (u64)flow->npagesets); return -EAGAIN; ok: return 0; @@ -1250,6 +1264,10 @@ static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, EXP_TID_SET(IDX, rcventry >> 1) | EXP_TID_SET(CTRL, tidctrl) | EXP_TID_SET(LEN, npages); + trace_hfi1_tid_entry_alloc(/* entry */ + flow->req->qp, flow->tidcnt - 1, + flow->tid_entry[flow->tidcnt - 1]); + /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); npages = 0; @@ -1316,6 +1334,7 @@ static void kern_program_rcvarray(struct tid_rdma_flow *flow) flow->tidcnt = 0; for (i = 0; i < flow->tnode_cnt; i++) kern_program_rcv_group(flow, i, &pset_idx); + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow); } /** diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 7c8aed0ffc07..f1154c3c294e 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -46,6 +46,7 @@ */ 
#define CREATE_TRACE_POINTS #include "trace.h" +#include "exp_rcv.h" static u8 __get_ib_hdr_len(struct ib_header *hdr) { @@ -394,6 +395,21 @@ const char *print_u32_array( return ret; } +u8 hfi1_trace_get_tid_ctrl(u32 ent) +{ + return EXP_TID_GET(ent, CTRL); +} + +u16 hfi1_trace_get_tid_len(u32 ent) +{ + return EXP_TID_GET(ent, LEN); +} + +u16 hfi1_trace_get_tid_idx(u32 ent) +{ + return EXP_TID_GET(ent, IDX); +} + __hfi1_trace_fn(AFFINITY); __hfi1_trace_fn(PKT); __hfi1_trace_fn(PROC); diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 57a973c97cde..c1da744f44a5 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -21,10 +21,21 @@ __print_symbolic(type, \ #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_tid +u8 hfi1_trace_get_tid_ctrl(u32 ent); +u16 hfi1_trace_get_tid_len(u32 ent); +u16 hfi1_trace_get_tid_idx(u32 ent); + #define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \ "max write %u, max length %u, jkey 0x%x timeout %u " \ "urg %u" +#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \ + "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \ + "npagesets %u tnode_cnt %u tidcnt %u length %u" + +#define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \ + "used %u cnt %u" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -323,6 +334,229 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_alloc_tids, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DECLARE_EVENT_CLASS(/* tid_flow_page */ + hfi1_tid_flow_page_template, + TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, + char mtu8k, char v1, void *vaddr), + TP_ARGS(qp, flow, index, mtu8k, v1, vaddr), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, mtu8k) + __field(char, v1) + __field(u32, index) + __field(u64, page) + __field(u64, vaddr) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->mtu8k = mtu8k; + __entry->v1 = v1; + __entry->index = index; + __entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL; + __entry->vaddr = (u64)vaddr; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->page, + __entry->mtu8k ? (__entry->v1 ? 
"v1" : "v0") : "vaddr", + __entry->vaddr + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_page_template, hfi1_tid_flow_page, + TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, + char mtu8k, char v1, void *vaddr), + TP_ARGS(qp, flow, index, mtu8k, v1, vaddr) +); + +DECLARE_EVENT_CLASS(/* tid_pageset */ + hfi1_tid_pageset_template, + TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count), + TP_ARGS(qp, index, idx, count), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, index) + __field(u16, idx) + __field(u16, count) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->idx = idx; + __entry->count = count; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x list[%u]: idx %u count %u", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->count + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_pageset_template, hfi1_tid_pageset, + TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count), + TP_ARGS(qp, index, idx, count) +); + +DECLARE_EVENT_CLASS(/* tid_fow */ + hfi1_tid_flow_template, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(int, idx) + __field(u32, resp_ib_psn) + __field(u32, generation) + __field(u32, fspsn) + __field(u32, flpsn) + __field(u32, r_next_psn) + __field(u32, npagesets) + __field(u32, tnode_cnt) + __field(u32, tidcnt) + __field(u32, length) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->idx = flow->idx; + __entry->resp_ib_psn = flow->flow_state.resp_ib_psn; + __entry->generation = flow->flow_state.generation; + __entry->fspsn = full_flow_psn(flow, + flow->flow_state.spsn); + __entry->flpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + __entry->r_next_psn = flow->flow_state.r_next_psn; + __entry->npagesets = flow->npagesets; + __entry->tnode_cnt = flow->tnode_cnt; + __entry->tidcnt = flow->tidcnt; + __entry->length = flow->length; + ), + TP_printk(/* print */ + TID_FLOW_PRN, + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->resp_ib_psn, + __entry->generation, + __entry->fspsn, + __entry->flpsn, + __entry->r_next_psn, + __entry->npagesets, + __entry->tnode_cnt, + __entry->tidcnt, + __entry->length + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_alloc, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DECLARE_EVENT_CLASS(/* tid_node */ + hfi1_tid_node_template, + TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, + u8 map, u8 used, u8 cnt), + TP_ARGS(qp, msg, index, base, map, used, cnt), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __string(msg, msg) + __field(u32, index) + __field(u32, base) + __field(u8, map) + __field(u8, used) + __field(u8, cnt) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __assign_str(msg, msg); + __entry->index = index; + __entry->base = base; + __entry->map = map; + __entry->used = used; + __entry->cnt = cnt; + ), + TP_printk(/* print */ + TID_NODE_PRN, + __get_str(dev), + __entry->qpn, + __get_str(msg), + 
__entry->index, + __entry->base, + __entry->map, + __entry->used, + __entry->cnt + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_node_template, hfi1_tid_node_add, + TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, + u8 map, u8 used, u8 cnt), + TP_ARGS(qp, msg, index, base, map, used, cnt) +); + +DECLARE_EVENT_CLASS(/* tid_entry */ + hfi1_tid_entry_template, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(u8, ctrl) + __field(u16, idx) + __field(u16, len) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->ctrl = hfi1_trace_get_tid_ctrl(ent); + __entry->idx = hfi1_trace_get_tid_idx(ent); + __entry->len = hfi1_trace_get_tid_len(ent); + ), + TP_printk(/* print */ + "[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->len, + __entry->ctrl + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_alloc, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-59-g8ed1b From 742a3826cf82395e304df99f6494d04b0dd03a84 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:40 -0800 Subject: IB/hfi1: Add functions to build TID RDMA READ request This patch adds the helper functions to build the TID RDMA READ request on the requester side. The key is to allocate TID resources (TID flow and TID entries) and send the resource information to the responder side along with the read request. Since the TID resources are limited, each TID RDMA READ request has to be split into segments with a default segment size of 256K. A software flow is allocated to track the data transaction for each segment. The work request opcode, packet opcode, and packet formats for TID RDMA READ protocol are also defined in this patch. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 200 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 32 ++++++ drivers/infiniband/hw/hfi1/verbs.h | 9 ++ include/rdma/ib_hdrs.h | 9 +- include/rdma/tid_rdma_defs.h | 52 +++++++++ 5 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 include/rdma/tid_rdma_defs.h (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 506b5a59ded5..56c8c10b5a85 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -6,11 +6,27 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs.h" #include "tid_rdma.h" #include "exp_rcv.h" #include "trace.h" +/** + * DOC: TID RDMA READ protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the requester side. It + * converts a qualified RDMA READ request into a TID RDMA READ request on + * the requester side and thereafter handles the request and response + * differently. 
To be qualified, the RDMA READ request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- Each local scatter-gather entry should be 4K page aligned; + * -- Each local scatter-gather entry should be a multiple of 4K page size; + */ + #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) @@ -18,6 +34,9 @@ #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) +/* Maximum number of packets within a flow generation. */ +#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) + #define GENERATION_MASK 0xFFFFF static u32 mask_generation(u32 a) @@ -45,6 +64,9 @@ static u32 mask_generation(u32 a) #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) +#define TID_RDMA_DESTQP_FLOW_SHIFT 11 +#define TID_RDMA_DESTQP_FLOW_MASK 0x1f + #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 #define TID_OPFN_QP_KDETH_MASK 0xff @@ -1597,3 +1619,181 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } + +/* TID RDMA READ functions */ +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_swqe_priv *wpriv = wqe->priv; + struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; + struct tid_rdma_params *remote; + u32 req_len = 0; + void *req_addr = NULL; + + /* This is the IB psn used to send the request */ + *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + + /* TID Entries for TID RDMA READ payload */ + req_addr = &flow->tid_entry[flow->tid_idx]; + req_len = sizeof(*flow->tid_entry) * + (flow->tidcnt - flow->tid_idx); + + memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); + wpriv->ss.sge.vaddr = req_addr; + wpriv->ss.sge.sge_length = req_len; + wpriv->ss.sge.length = wpriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. 
+ */ + wpriv->ss.sge.mr = NULL; + wpriv->ss.sge.m = 0; + wpriv->ss.sge.n = 0; + + wpriv->ss.sg_list = NULL; + wpriv->ss.total_len = wpriv->ss.sge.sge_length; + wpriv->ss.num_sge = 1; + + /* Construct the TID RDMA READ REQ packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(rreq->kdeth0, KVER, 0x1); + KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); + rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + + req->cur_seg * req->seg_len + flow->sent); + rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); + rreq->reth.length = cpu_to_be32(*len); + rreq->tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + ((flow->flow_state.spsn + flow->pkt) & + HFI1_KDETH_BTH_SEQ_MASK)); + rreq->tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + *bth2 |= IB_BTH_REQ_ACK; + rcu_read_unlock(); + + /* We are done with this segment */ + flow->sent += *len; + req->cur_seg++; + qp->s_state = TID_OP(READ_REQ); + req->ack_pending++; + req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); + qpriv->pending_tid_r_segs++; + qp->s_num_rd_atomic++; + + /* Set the TID RDMA READ request payload size */ + *len = req_len; + + return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); +} + +/* + * @len: contains the data length to read upon entry and the read request + * payload length upon exit. + */ +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = NULL; + u32 hdwords = 0; + bool last; + bool retry = true; + u32 npkts = rvt_div_round_up_mtu(qp, *len); + + /* + * Check sync conditions. Make sure that there are no pending + * segments before freeing the flow. + */ +sync_check: + if (req->state == TID_REQUEST_SYNC) { + if (qpriv->pending_tid_r_segs) + goto done; + + hfi1_kern_clear_hw_flow(req->rcd, qp); + req->state = TID_REQUEST_ACTIVE; + } + + /* + * If the request for this segment is resent, the tid resources should + * have been allocated before. In this case, req->flow_idx should + * fall behind req->setup_head. + */ + if (req->flow_idx == req->setup_head) { + retry = false; + if (req->state == TID_REQUEST_RESEND) { + /* + * This is the first new segment for a request whose + * earlier segments have been re-sent. We need to + * set up the sge pointer correctly. + */ + restart_sge(&qp->s_sge, wqe, req->s_next_psn, + qp->pmtu); + req->isge = 0; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * Check sync. The last PSN of each generation is reserved for + * RESYNC. + */ + if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { + req->state = TID_REQUEST_SYNC; + goto sync_check; + } + + /* Allocate the flow if not yet */ + if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) + goto done; + + /* + * The following call will advance req->setup_head after + * allocating the tid entries. + */ + if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { + req->state = TID_REQUEST_QUEUED; + + /* + * We don't have resources for this segment. The QP has + * already been queued. 
+ */ + goto done; + } + } + + /* req->flow_idx should only be one slot behind req->setup_head */ + flow = &req->flows[req->flow_idx]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->sent = 0; + if (!retry) { + /* Set the first and last IB PSN for the flow in use.*/ + flow->flow_state.ib_spsn = req->s_next_psn; + flow->flow_state.ib_lpsn = + flow->flow_state.ib_spsn + flow->npkts - 1; + } + + /* Calculate the next segment start psn.*/ + req->s_next_psn += flow->npkts; + + /* Build the packet header */ + hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3dbeaa8cb5b3..f692f3ff9419 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -45,6 +45,19 @@ struct tid_flow_state { u8 flags; }; +enum tid_rdma_req_state { + TID_REQUEST_INACTIVE = 0, + TID_REQUEST_INIT, + TID_REQUEST_INIT_RESEND, + TID_REQUEST_ACTIVE, + TID_REQUEST_RESEND, + TID_REQUEST_RESEND_ACTIVE, + TID_REQUEST_QUEUED, + TID_REQUEST_SYNC, + TID_REQUEST_RNR_NAK, + TID_REQUEST_COMPLETE, +}; + struct tid_rdma_request { struct rvt_qp *qp; struct hfi1_ctxtdata *rcd; @@ -60,8 +73,13 @@ struct tid_rdma_request { u16 flow_idx; /* flow index most recently set up */ u32 seg_len; + u32 s_next_psn; /* IB PSN of next segment start for read */ + u32 cur_seg; /* index of current segment */ u32 isge; /* index of "current" sge */ + u32 ack_pending; /* num acks pending for this request */ + + enum tid_rdma_req_state state; }; /* @@ -77,6 +95,10 @@ struct flow_state { u32 spsn; /* starting PSN in TID space */ u32 lpsn; /* last PSN in TID space */ u32 r_next_psn; /* next PSN to be received (in TID space) */ + + /* For tid rdma read */ + u32 ib_spsn; /* starting PSN in Verbs space */ + u32 ib_lpsn; /* last PSn in Verbs space */ }; struct tid_rdma_pageset { @@ -110,11 +132,14 @@ struct tid_rdma_flow { struct flow_state flow_state; struct tid_rdma_request *req; u32 length; + u32 sent; u8 tnode_cnt; u8 tidcnt; + u8 tid_idx; u8 idx; u8 npagesets; u8 npkts; + u8 pkt; struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; u32 tid_entry[TID_RDMA_MAX_PAGES]; @@ -159,4 +184,11 @@ struct cntr_entry; u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, void *context, int vl, int mode, u64 data); +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 20729454f181..2965b0957855 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -170,12 +170,16 @@ struct hfi1_qp_priv { struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ unsigned long tid_timer_timeout_jiffies; + + /* For TID RDMA READ */ + u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ }; struct hfi1_swqe_priv { struct tid_rdma_request tid_req; + struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ }; struct hfi1_ack_priv { @@ -331,6 +335,11 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +static inline struct tid_rdma_request 
*wqe_to_tid_req(struct rvt_swqe *wqe) +{ + return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req; +} + /* * Look through all the active flows for a TID RDMA request and find * the one (if it exists) that contains the specified PSN. diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 6e35416170a3..58a0a0f99e7f 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,6 +100,8 @@ struct ib_atomic_eth { __be64 compare_data; /* potentially unaligned */ } __packed; +#include + union ib_ehdrs { struct { __be32 deth[2]; @@ -117,6 +119,11 @@ union ib_ehdrs { __be32 aeth; __be32 ieth; struct ib_atomic_eth atomic_eth; + /* TID RDMA headers */ + union { + struct tid_rdma_read_req r_req; + struct tid_rdma_read_resp r_rsp; + } tid_rdma; } __packed; struct ib_other_headers { diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h new file mode 100644 index 000000000000..1c431ea32b52 --- /dev/null +++ b/include/rdma/tid_rdma_defs.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef TID_RDMA_DEFS_H +#define TID_RDMA_DEFS_H + +#include + +struct tid_rdma_read_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_read_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[4]; + __be32 verbs_psn; + __be32 verbs_qp; +}; + +/* + * TID RDMA Opcodes + */ +#define IB_OPCODE_TID_RDMA 0xe0 +enum { + IB_OPCODE_READ_REQ = 0x4, + IB_OPCODE_READ_RESP = 0x5, + + IB_OPCODE(TID_RDMA, READ_REQ), + IB_OPCODE(TID_RDMA, READ_RESP), +}; + +#define TID_OP(x) IB_OPCODE_TID_RDMA_##x + +/* + * Define TID RDMA specific WR opcodes. The ib_wr_opcode + * enum already provides some reserved values for use by + * low level drivers. Two of those are used but renamed + * to be more descriptive. + */ +#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 + +#endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3-59-g8ed1b From 6b6cf9357f78057292ae662438f441cb84f93a25 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:51 -0800 Subject: IB/hfi1: Set PbcInsertHcrc for TID RDMA packets All TID RDMA packets are in KDETH packet format and therefore the PbcInsertHcrc must be set properly before sending the packet to hardware. Otherwise, the packets will be dropped by the receiver. By default, HCRC is not inserted for 9B packets without KDETH, and this patch adds that back for TID RDMA packets. 
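Condensed from the hunks below, the fix is one small helper applied on both the SDMA and PIO send paths right after the PBC is built: any opcode in the TID RDMA range (0xe0 and above) has PbcInsertHcrc rewritten to request the KDETH HCRC so the receiver does not drop the packet.

	/* Force HCRC insertion for KDETH (TID RDMA) packets */
	static u64 update_hcrc(u8 opcode, u64 pbc)
	{
		if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
			pbc &= ~PBC_INSERT_HCRC_SMASK;
			pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
		}
		return pbc;
	}

	/* both hfi1_verbs_send_dma() and hfi1_verbs_send_pio() then do */
	pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
	pbc = update_hcrc(ps->opcode, pbc);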
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/verbs.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 02c1873a976c..8887a71edb98 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -308,7 +308,7 @@ static inline opcode_handler qp_ok(struct hfi1_packet *packet) static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) { #ifdef CONFIG_FAULT_INJECTION - if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) + if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) { /* * In order to drop non-IB traffic we * set PbcInsertHrc to NONE (0x2). @@ -319,8 +319,9 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) * packet will not be delivered to the * correct context. */ + pbc &= ~PBC_INSERT_HCRC_SMASK; pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; - else + } else { /* * In order to drop regular verbs * traffic we set the PbcTestEbp @@ -330,6 +331,7 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) * triggered and will be dropped. */ pbc |= PBC_TEST_EBP; + } #endif return pbc; } @@ -683,6 +685,15 @@ bail_txadd: return ret; } +static u64 update_hcrc(u8 opcode, u64 pbc) +{ + if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) { + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT; + } + return pbc; +} + int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { @@ -728,6 +739,9 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, qp->srate_mbps, vl, plen); + + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); } tx->wqe = qp->s_wqe; ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); @@ -876,6 +890,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); } if (cb) iowait_pio_inc(&priv->s_iowait); -- cgit v1.2.3-59-g8ed1b From d0d564a1caacc7f3f28f3e351ed89ed000e2de75 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:02 -0800 Subject: IB/hfi1: Add functions to receive TID RDMA READ request This patch adds the functions to receive TID RDMA READ request. The TID resource information will be stored and tracked. Duplicate request will also be handled properly. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 329 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 8 + drivers/infiniband/hw/hfi1/verbs.h | 5 + 3 files changed, 342 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 56c8c10b5a85..d8a46b7ddca0 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1797,3 +1797,332 @@ sync_check: done: return hdwords; } + +/* + * Validate and accept the TID RDMA READ request parameters. + * Return 0 if the request is accepted successfully; + * Return 1 otherwise. 
+ */ +static int tid_rdma_rcv_read_request(struct rvt_qp *qp, + struct rvt_ack_entry *e, + struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + u32 bth0, u32 psn, u64 vaddr, u32 len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 flow_psn, i, tidlen = 0, pktlen, tlen; + + req = ack_to_tid_req(e); + + /* Validate the payload first */ + flow = &req->flows[req->setup_head]; + + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) + return 1; + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. Also calculate the number of required packets. + */ + flow->npkts = rvt_div_round_up_mtu(qp, len); + for (i = 0; i < flow->tidcnt; i++) { + tlen = EXP_TID_GET(flow->tid_entry[i], LEN); + if (!tlen) + return 1; + + /* + * For tid pair (tidctr == 3), the buffer size of the pair + * should be the sum of the buffer size described by each + * tid entry. However, only the first entry needs to be + * specified in the request (see WFR HAS Section 8.5.7.1). + */ + tidlen += tlen; + } + if (tidlen * PAGE_SIZE < len) + return 1; + + /* Empty the flow array */ + req->clear_tail = req->setup_head; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->length = len; + + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + flow->flow_state.ib_spsn = psn; + flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; + + /* Set the initial flow index to the current flow. */ + req->flow_idx = req->setup_head; + + /* advance circular buffer head */ + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + + /* + * Compute last PSN for request. 
+ */ + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = psn + flow->npkts - 1; + e->sent = 0; + + req->n_flows = qpriv->tid_rdma.local.max_read; + req->state = TID_REQUEST_ACTIVE; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = 1; + req->r_flow_psn = e->psn; + + return 0; +} + +static int tid_rdma_rcv_error(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + unsigned long flags; + u8 prev; + bool old_req; + + if (diff > 0) { + /* sequence error */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + qp->r_ack_psn = qp->r_psn; + rc_defered_ack(rcd, qp); + } + goto done; + } + + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + e = find_prev_entry(qp, psn, &prev, NULL, &old_req); + if (!e || e->opcode != TID_OP(READ_REQ)) + goto unlock; + + req = ack_to_tid_req(e); + req->r_flow_psn = psn; + + if (e->opcode == TID_OP(READ_REQ)) { + struct ib_reth *reth; + u32 offset; + u32 len; + u32 rkey; + u64 vaddr; + int ok; + u32 bth0; + + reth = &ohdr->u.tid_rdma.r_req.reth; + /* + * The requester always restarts from the start of the original + * request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (psn != e->psn || len != req->total_len) + goto unlock; + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + rkey = be32_to_cpu(reth->rkey); + vaddr = get_ib_reth_vaddr(reth); + + qp->r_len = len; + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock; + + /* + * If all the response packets for the current request have + * been sent out and this request is complete (old_request + * == false) and the TID flow may be unusable (the + * req->clear_tail is advanced). However, when an earlier + * request is received, this request will not be complete any + * more (qp->s_tail_ack_queue is moved back, see below). + * Consequently, we need to update the TID flow info everytime + * a duplicate request is received. + */ + bth0 = be32_to_cpu(ohdr->bth[0]); + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, + vaddr, len)) + goto unlock; + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue); + */ + if (old_req) + goto unlock; + } + /* Re-process old requests.*/ + qp->s_tail_ack_queue = prev; + /* + * Since the qp->s_tail_ack_queue is modified, the + * qp->s_ack_state must be changed to re-initialize + * qp->s_ack_rdma_sge; Otherwise, we will end up in + * wrong memory region. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->r_state = e->opcode; + qp->r_nak_state = 0; + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; +} + +void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ + * (see hfi1_rc_rcv()) + * 2. 
Put TID RDMA READ REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Initialize struct tid_rdma_flow info; + * - Copy TID entries; + * 3. Set the qp->s_ack_state. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 bth0, psn, len, rkey; + bool is_fecn; + u8 next; + u64 vaddr; + int diff; + u8 nack_state = IB_NAK_INVALID_REQUEST; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.r_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) + goto nack_inv; + + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) + return; + goto send_ack; + } + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) { + nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + goto nack_inv_unlock; + } + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_READ))) + goto nack_acc; + + /* Accept the request parameters */ + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, + len)) + goto nack_inv_unlock; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn += e->lpsn - e->psn + 1; + + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. 
*/ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = nack_state; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index f692f3ff9419..439329398ccc 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -73,9 +73,14 @@ struct tid_rdma_request { u16 flow_idx; /* flow index most recently set up */ u32 seg_len; + u32 total_len; + u32 r_flow_psn; /* IB PSN of next segment start */ u32 s_next_psn; /* IB PSN of next segment start for read */ + u32 total_segs; /* segments required to complete a request */ u32 cur_seg; /* index of current segment */ + u32 comp_seg; /* index of last completed segment */ + u32 ack_seg; /* index of last ack'ed segment */ u32 isge; /* index of "current" sge */ u32 ack_pending; /* num acks pending for this request */ @@ -131,6 +136,8 @@ struct tid_rdma_flow { */ struct flow_state flow_state; struct tid_rdma_request *req; + u32 tid_qpn; + u32 tid_offset; u32 length; u32 sent; u8 tnode_cnt; @@ -190,5 +197,6 @@ u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); +void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 2965b0957855..5e910c508360 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -340,6 +340,11 @@ static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe) return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req; } +static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e) +{ + return &((struct hfi1_ack_priv *)e->priv)->tid_req; +} + /* * Look through all the active flows for a TID RDMA request and find * the one (if it exists) that contains the specified PSN. -- cgit v1.2.3-59-g8ed1b From 1db21b50502856c1da5628e3644bd29710e928f0 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:12 -0800 Subject: IB/hfi1: Add a function to build TID RDMA READ response This patch adds the function to build TID RDMA READ response packet. The previously received TID resource information will be used to build the KDETH packet, which will direct the delivery of packet payload by hardware. 
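Condensed from the function added below: each call emits one response packet for the flow at req->clear_tail, sizing the payload by the smaller of the MTU and what remains in the current TID entry, programming the KDETH words so the requester's receive hardware lands the payload directly in the user buffer, and carrying the TID flow PSN in bth2. A sketch of the core header setup:

	u32 tidentry = flow->tid_entry[flow->tid_idx];
	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;

	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);

	/* KDETH fields tell the remote HW which TID entry/offset to use */
	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / KDETH_OM_LARGE);

	/* bth2 carries the flow PSN: generation in the high bits, per-flow
	 * sequence in the low HFI1_KDETH_BTH_SEQ bits */
	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
			  HFI1_KDETH_BTH_SEQ_MASK) |
			 (flow->flow_state.generation <<
			  HFI1_KDETH_BTH_SEQ_SHIFT));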
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 67 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++ 2 files changed, 70 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index d8a46b7ddca0..888954a79dac 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2126,3 +2126,70 @@ nack_acc: send_ack: hfi1_send_rc_ack(packet, is_fecn); } + +u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth0, + u32 *bth1, u32 *bth2, u32 *len, bool *last) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + u32 hdwords = 0; + struct tid_rdma_params *remote; + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->sent >= flow->length); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + if (!remote) { + rcu_read_unlock(); + goto done; + } + KDETH_RESET(resp->kdeth0, KVER, 0x1); + KDETH_SET(resp->kdeth0, SH, !last_pkt); + KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); + resp->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + resp->aeth = rvt_compute_aeth(qp); + resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + + flow->pkt)); + + *bth0 = TID_OP(READ_RESP) << 24; + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + *last = last_pkt; + if (last_pkt) + /* Advance to next flow */ + req->clear_tail = (req->clear_tail + 1) & + (MAX_FLOWS - 1); + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + + hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); + +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 439329398ccc..01ded7c0c302 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -198,5 +198,8 @@ u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth0, + u32 *bth1, u32 *bth2, u32 *len, bool *last); #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 9905bf06e890c2a845ac8fd19d7e6b8987ef8df6 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Tue, 5 Feb 2019 14:13:30 -0800 Subject: IB/hfi1: Add 
functions to receive TID RDMA READ response This patch adds the functions to receive TID RDMA READ response. The TID resource information in the KDETH packet header will direct the hardware to deliver the packet payload to the user buffer automatically and the software will handle the packet header for the last packet of a segment as all other packet headers are suppressed by default. The TID entries will be freed when all packets for a segment have been received. This patch also adds the functions to handle KDETH eflag errors, including flow sequence and generation errors, when a TID RDMA READ response packet is received . The flow sequence error can be recovered by software checking of the flow sequence and will disappear when the hardware flow is programmed with a new generation number. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 521 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 6 + drivers/infiniband/hw/hfi1/verbs.h | 2 + 3 files changed, 529 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 888954a79dac..da8b63ec0f8d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -67,6 +67,8 @@ static u32 mask_generation(u32 a) #define TID_RDMA_DESTQP_FLOW_SHIFT 11 #define TID_RDMA_DESTQP_FLOW_MASK 0x1f +#define TID_FLOW_SW_PSN BIT(0) + #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 #define TID_OPFN_QP_KDETH_MASK 0xff @@ -1620,6 +1622,34 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } +static struct tid_rdma_flow * +__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail, + u32 psn, u16 *fidx) +{ + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + struct tid_rdma_flow *flow = &req->flows[tail]; + u32 spsn, lpsn; + + spsn = full_flow_psn(flow, flow->flow_state.spsn); + lpsn = full_flow_psn(flow, flow->flow_state.lpsn); + + if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + +static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn, + fidx); +} + /* TID RDMA READ functions */ u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, @@ -2193,3 +2223,494 @@ u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, done: return hdwords; } + +static inline struct tid_rdma_request * +find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) + __must_hold(&qp->s_lock) +{ + struct rvt_swqe *wqe; + struct tid_rdma_request *req = NULL; + u32 i, end; + + end = qp->s_cur + 1; + if (end == qp->s_size) + end = 0; + for (i = qp->s_acked; i != end;) { + wqe = rvt_get_swqe_ptr(qp, i); + if (cmp_psn(psn, wqe->psn) >= 0 && + cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == opcode) + req = wqe_to_tid_req(wqe); + break; + } + if (++i == qp->s_size) + i = 0; + } + + return req; +} + +void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that the entire segment has been read. + * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 4. 
Free the TID flow resources. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 opcode, aeth; + bool is_fecn; + unsigned long flags; + u32 kpsn, ipsn; + + is_fecn = process_ecn(qp, packet); + kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); + req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); + if (unlikely(!req)) + goto ack_op_err; + + flow = &req->flows[req->clear_tail]; + /* When header suppression is disabled */ + if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) + goto ack_done; + req->ack_pending--; + priv->pending_tid_r_segs--; + qp->s_num_rd_atomic--; + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + + /* Release the tid resources */ + hfi1_kern_exp_rcv_clear(req); + + if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) + goto ack_done; + + /* If not done yet, build next read request */ + if (++req->comp_seg >= req->total_segs) { + priv->tid_r_comp++; + req->state = TID_REQUEST_COMPLETE; + } + + /* + * Clear the hw flow under two conditions: + * 1. This request is a sync point and it is complete; + * 2. Current request is completed and there are no more requests. + */ + if ((req->state == TID_REQUEST_SYNC && + req->comp_seg == req->cur_seg) || + priv->tid_r_comp == priv->tid_r_reqs) { + hfi1_kern_clear_hw_flow(priv->rcd, qp); + if (req->state == TID_REQUEST_SYNC) + req->state = TID_REQUEST_ACTIVE; + } + + hfi1_schedule_send(qp); + goto ack_done; + +ack_op_err: + /* + * The test indicates that the send engine has finished its cleanup + * after sending the request and it's now safe to put the QP into error + * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail + * == qp->s_head), it would be unsafe to complete the wqe pointed by + * qp->s_acked here. Putting the qp into error state will safely flush + * all remaining requests. 
+ */ + if (qp->s_last == qp->s_acked) + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + hfi1_send_rc_ack(packet, is_fecn); +} + +void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + /* Free any TID entries */ + while (n != qp->s_tail) { + wqe = rvt_get_swqe_ptr(qp, n); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + } + + if (++n == qp->s_size) + n = 0; + } + /* Free flow */ + hfi1_kern_clear_hw_flow(priv->rcd, qp); +} + +static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, u8 rcv_type, + u8 opcode) +{ + struct rvt_qp *qp = packet->qp; + u32 ipsn; + struct ib_other_headers *ohdr = packet->ohdr; + + if (rcv_type >= RHF_RCV_TYPE_IB) + goto done; + + spin_lock(&qp->s_lock); + /* + * For TID READ response, error out QP after freeing the tid + * resources. + */ + if (opcode == TID_OP(READ_RESP)) { + ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); + if (cmp_psn(ipsn, qp->s_last_psn) > 0 && + cmp_psn(ipsn, qp->s_psn) < 0) { + hfi1_kern_read_tid_flow_free(qp); + spin_unlock(&qp->s_lock); + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto done; + } + } + + spin_unlock(&qp->s_lock); +done: + return true; +} + +static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + + /* Start from the right segment */ + qp->r_flags |= RVT_R_RDMAR_SEQ; + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->clear_tail]; + hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/* + * Handle the KDETH eflags for TID RDMA READ response. + * + * Return true if the last packet for a segment has been received and it is + * time to process the response normally; otherwise, return true. + * + * The caller must hold the packet->qp->r_lock and the rcu_read_lock. + */ +static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, u8 rcv_type, + u8 rte, u32 psn, u32 ibpsn) + __must_hold(&packet->qp->r_lock) __must_hold(RCU) +{ + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ibport *ibp; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 ack_psn; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + bool ret = true; + int diff = 0; + u32 fpsn; + + lockdep_assert_held(&qp->r_lock); + /* If the psn is out of valid range, drop the packet */ + if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || + cmp_psn(ibpsn, qp->s_psn) > 0) + return ret; + + spin_lock(&qp->s_lock); + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. + */ + ack_psn = ibpsn - 1; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* Complete WQEs that the PSN finishes. */ + while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { + /* + * If this request is a RDMA read or atomic, and the NACK is + * for a later operation, this NACK NAKs the RDMA read or + * atomic. 
+ */ + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + restart_tid_rdma_read_req(rcd, qp, + wqe); + } else { + hfi1_restart_rc(qp, qp->s_last_psn + 1, + 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(/* wait */ + &qp->rspwait, + &rcd->qp_wait_list); + } + } + } + /* + * No need to process the NAK since we are + * restarting an earlier request. + */ + break; + } + + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + /* Handle the eflags for the request */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto s_unlock; + + req = wqe_to_tid_req(wqe); + switch (rcv_type) { + case RHF_RCV_TYPE_EXPECTED: + switch (rte) { + case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: + /* + * On the first occurrence of a Flow Sequence error, + * the flag TID_FLOW_SW_PSN is set. + * + * After that, the flow is *not* reprogrammed and the + * protocol falls back to SW PSN checking. This is done + * to prevent continuous Flow Sequence errors for any + * packets that could be still in the fabric. + */ + flow = find_flow(req, psn, NULL); + if (!flow) { + /* + * We can't find the IB PSN matching the + * received KDETH PSN. The only thing we can + * do at this point is report the error to + * the QP. + */ + hfi1_kern_read_tid_flow_free(qp); + spin_unlock(&qp->s_lock); + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return ret; + } + if (priv->flow_state.flags & TID_FLOW_SW_PSN) { + diff = cmp_psn(psn, + priv->flow_state.r_next_psn); + if (diff > 0) { + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) + restart_tid_rdma_read_req(rcd, + qp, + wqe); + + /* Drop the packet.*/ + goto s_unlock; + } else if (diff < 0) { + /* + * If a response packet for a restarted + * request has come back, reset the + * restart flag. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) + qp->r_flags &= + ~RVT_R_RDMAR_SEQ; + + /* Drop the packet.*/ + goto s_unlock; + } + + /* + * If SW PSN verification is successful and + * this is the last packet in the segment, tell + * the caller to process it as a normal packet. + */ + fpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + if (cmp_psn(fpsn, psn) == 0) { + ret = false; + if (qp->r_flags & RVT_R_RDMAR_SEQ) + qp->r_flags &= + ~RVT_R_RDMAR_SEQ; + } + priv->flow_state.r_next_psn++; + } else { + u64 reg; + u32 last_psn; + + /* + * The only sane way to get the amount of + * progress is to read the HW flow state. + */ + reg = read_uctxt_csr(dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + + (8 * flow->idx)); + last_psn = mask_psn(reg); + + priv->flow_state.r_next_psn = last_psn; + priv->flow_state.flags |= TID_FLOW_SW_PSN; + /* + * If no request has been restarted yet, + * restart the current one. + */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) + restart_tid_rdma_read_req(rcd, qp, + wqe); + } + + break; + + case RHF_RTE_EXPECTED_FLOW_GEN_ERR: + /* + * Since the TID flow is able to ride through + * generation mismatch, drop this stale packet. 
+ */ + break; + + default: + break; + } + break; + + case RHF_RCV_TYPE_ERROR: + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: + case RHF_RTE_ERROR_KHDR_HCRC_ERR: + case RHF_RTE_ERROR_KHDR_KVER_ERR: + case RHF_RTE_ERROR_CONTEXT_ERR: + case RHF_RTE_ERROR_KHDR_TID_ERR: + default: + break; + } + default: + break; + } +s_unlock: + spin_unlock(&qp->s_lock); + return ret; +} + +bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct hfi1_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u8 rcv_type = rhf_rcv_type(packet->rhf); + u8 rte = rhf_rcv_type_err(packet->rhf); + struct ib_header *hdr = packet->hdr; + struct ib_other_headers *ohdr = NULL; + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u16 lid = be16_to_cpu(hdr->lrh[1]); + u8 opcode; + u32 qp_num, psn, ibpsn; + struct rvt_qp *qp; + unsigned long flags; + bool ret = true; + + if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) + return ret; + + packet->ohdr = &hdr->u.oth; + ohdr = packet->ohdr; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & + RVT_QPN_MASK; + if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto drop; + + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) + goto rcu_unlock; + + packet->qp = qp; + + /* Check for valid receive state. */ + spin_lock_irqsave(&qp->r_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto r_unlock; + } + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + + /* Sanity check packet */ + if (tlen < 24) + goto r_unlock; + + /* + * Check for GRH. We should never get packets with GRH in this + * path. 
+ */ + if (lnh == HFI1_LRH_GRH) + goto r_unlock; + + if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode)) + goto r_unlock; + } + + /* handle TID RDMA READ */ + if (opcode == TID_OP(READ_RESP)) { + ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); + ibpsn = mask_psn(ibpsn); + ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, + ibpsn); + } + +r_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +rcu_unlock: + rcu_read_unlock(); +drop: + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 01ded7c0c302..d428236aef68 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -177,6 +177,8 @@ static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) __trdma_clean_swqe(qp, wqe); } +void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp); + int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); @@ -201,5 +203,9 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet); u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, struct ib_other_headers *ohdr, u32 *bth0, u32 *bth1, u32 *bth2, u32 *len, bool *last); +void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet); +bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_pportdata *ppd, + struct hfi1_packet *packet); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 5e910c508360..c1574c0ed9a7 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,6 +172,8 @@ struct hfi1_qp_priv { unsigned long tid_timer_timeout_jiffies; /* For TID RDMA READ */ + u32 tid_r_reqs; /* Num of tid reads requested */ + u32 tid_r_comp; /* Num of tid reads completed */ u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ -- cgit v1.2.3-59-g8ed1b From 22d136d7566f193fc67cdfd9228aaa7d122a3a45 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 24 Jan 2019 06:36:34 -0800 Subject: IB/hfi1: Add TID RDMA handlers This commit adds the TID RDMA READ pointers to the receiving opcode handlers. It also adds TID RDMA READ header sizes to header size table. A function to print the RHF EFLAGS errors is created so that it can be shared by both IB and TID RDMA receiving functions. 
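For readers unfamiliar with the receive path, packets are dispatched on the 8-bit BTH opcode through a 256-entry handler table, and this patch registers the TID RDMA READ opcodes in that table. The short stand-alone C sketch below only illustrates that table-driven pattern; it is not hfi1 code, and the opcode values and handler names in it are invented.

/*
 * Stand-alone sketch of table-driven opcode dispatch; opcode values and
 * handler names are invented and do not match the IB_OPCODE_* constants.
 */
#include <stdio.h>

struct packet {
        unsigned int opcode;
};

typedef void (*opcode_handler)(struct packet *pkt);

#define OP_RC_SEND_ONLY   0x04  /* stand-in for an RC opcode */
#define OP_TID_READ_REQ   0xe6  /* stand-in for a TID RDMA READ REQ opcode */
#define OP_TID_READ_RESP  0xe7  /* stand-in for a TID RDMA READ RESP opcode */

static void rc_rcv(struct packet *pkt)
{
        printf("RC handler, opcode 0x%x\n", pkt->opcode);
}

static void tid_read_req_rcv(struct packet *pkt)
{
        printf("TID RDMA READ REQ handler, opcode 0x%x\n", pkt->opcode);
}

/* 256 entries indexed by the 8-bit opcode; a NULL entry means "drop". */
static const opcode_handler handler_tbl[256] = {
        [OP_RC_SEND_ONLY] = rc_rcv,
        [OP_TID_READ_REQ] = tid_read_req_rcv,
};

static void dispatch(struct packet *pkt)
{
        opcode_handler handler = handler_tbl[pkt->opcode & 0xff];

        if (handler)
                handler(pkt);
        else
                printf("opcode 0x%x has no handler, dropping\n", pkt->opcode);
}

int main(void)
{
        struct packet req = { .opcode = OP_TID_READ_REQ };
        struct packet rsp = { .opcode = OP_TID_READ_RESP };

        dispatch(&req); /* handled by tid_read_req_rcv() */
        dispatch(&rsp); /* dropped: no handler registered in this sketch */
        return 0;
}

A NULL entry models the "no handler, drop" case, which is roughly how unhandled opcodes are treated in the driver.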
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 58 +++++++++++------ drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 125 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/verbs.h | 4 ++ 4 files changed, 167 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index a8ad70730203..2a9d2912f5db 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1575,25 +1575,32 @@ drop: return -EINVAL; } -void handle_eflags(struct hfi1_packet *packet) +static void show_eflags_errs(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; u32 rte = rhf_rcv_type_err(packet->rhf); + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + rcv_hdrerr(rcd, rcd->ppd, packet); if (rhf_err_flags(packet->rhf)) - dd_dev_err(rcd->dd, - "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", - rcd->ctxt, packet->rhf, - packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", - packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", - packet->rhf & RHF_DC_ERR ? "dc " : "", - packet->rhf & RHF_TID_ERR ? "tid " : "", - packet->rhf & RHF_LEN_ERR ? "len " : "", - packet->rhf & RHF_ECC_ERR ? "ecc " : "", - packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", - packet->rhf & RHF_ICRC_ERR ? "icrc " : "", - rte); + show_eflags_errs(packet); } /* @@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet) if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - if (unlikely(rhf_err_flags(packet->rhf))) - handle_eflags(packet); + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; - dd_dev_err(packet->rcd->dd, - "Unhandled expected packet received. Dropping.\n"); + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return RHF_RCV_CONTINUE; + } + + hfi1_kdeth_expected_rcv(packet); return RHF_RCV_CONTINUE; } @@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet) hfi1_setup_9B_packet(packet); if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - if (unlikely(rhf_err_flags(packet->rhf))) - handle_eflags(packet); - dd_dev_err(packet->rcd->dd, - "Unhandled eager packet received. 
Dropping.\n"); + trace_hfi1_rcvhdr(packet); + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; + + show_eflags_errs(packet); + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return RHF_RCV_CONTINUE; + } + + hfi1_kdeth_eager_rcv(packet); return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 1412ed157c98..6582184cc985 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -2120,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK | #endif HFI1_PKT_USER_SC_INTEGRITY; - else + else if (ctxt_type != SC_KERNEL) base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; /* turn on send-side job key checks if !A0 */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 8887a71edb98..2d59fcde4db6 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -200,6 +200,8 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -243,6 +245,11 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, + + /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, + [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, @@ -336,6 +343,124 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) return pbc; } +static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet) +{ + if (packet->qp->ibqp.qp_type != IB_QPT_RC || + !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + return NULL; + if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA) + return opcode_handler_tbl[opcode]; + return NULL; +} + +void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + 
opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + static int hfi1_do_pkey_check(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index c1574c0ed9a7..7642b59ad5d2 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -427,6 +427,10 @@ int hfi1_register_ib_device(struct hfi1_devdata *); void hfi1_unregister_ib_device(struct hfi1_devdata *); +void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet); + +void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet); + void hfi1_ib_rcv(struct hfi1_packet *packet); void hfi1_16B_rcv(struct hfi1_packet *packet); -- cgit v1.2.3-59-g8ed1b From b126078e8957f3aea4a44b8916f2f3752b5c392d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:46 -0800 Subject: IB/hfi1: Add functions for restarting TID RDMA READ request This patch adds functions to retry TID RDMA READ request. Since TID RDMA READ request could be retried from any segment boundary, it requires a number of tracking fields in various structures and those fields should be reset properly. The qp->s_num_rd_atomic field is reset before retry and therefore should be incremented for each new or retried RDMA READ or atomic request. 
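Because a retry always lands on a segment boundary, the number of segments issued before the restart point but not yet completed can be recomputed from the restart PSN and the packets-per-segment count, and that number is what gets added back to qp->s_num_rd_atomic. A stand-alone sketch of that arithmetic follows; the numbers are invented and it only loosely mirrors update_num_rd_atomic() in the diff below.

/*
 * Stand-alone illustration of the restart accounting; all numbers and
 * field names are invented and only mirror the patch loosely.
 */
#include <stdio.h>

int main(void)
{
        unsigned int wqe_psn  = 0x100; /* first PSN of the TID RDMA READ request */
        unsigned int restart  = 0x13c; /* PSN the retry restarts from (segment boundary) */
        unsigned int pkts_ps  = 20;    /* packets per segment */
        unsigned int comp_seg = 1;     /* segments already completed */

        /* Segment index that the restart PSN falls in. */
        unsigned int cur_seg = (restart - wqe_psn) / pkts_ps;

        /* Segments issued before the restart but not yet completed. */
        unsigned int ack_pending = cur_seg - comp_seg;

        printf("restart lands in segment %u\n", cur_seg);
        printf("re-add %u segment(s) to s_num_rd_atomic and pending_tid_r_segs\n",
               ack_pending);
        return 0;
}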
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 94 +++++++++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/tid_rdma.c | 82 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + 3 files changed, 158 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6e74cd3814b8..e478a0b93eb9 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -503,16 +503,14 @@ no_flow_control: * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } + qp->s_num_rd_atomic++; + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; put_ib_reth_vaddr( wqe->rdma_wr.remote_addr, &ohdr->u.rc.reth); @@ -534,14 +532,12 @@ no_flow_control: * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } + qp->s_num_rd_atomic++; /* FALLTHROUGH */ case IB_WR_OPFN: @@ -970,6 +966,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) return; } +/** + * update_num_rd_atomic - update the qp->s_num_rd_atomic + * @qp: the QP + * @psn: the packet sequence number to restart at + * @wqe: the wqe + * + * This is called from reset_psn() to update qp->s_num_rd_atomic + * for the current wqe. + * Called at interrupt level with the QP s_lock held. + */ +static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, + struct rvt_swqe *wqe) +{ + u32 opcode = wqe->wr.opcode; + + if (opcode == IB_WR_RDMA_READ || + opcode == IB_WR_ATOMIC_CMP_AND_SWP || + opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + qp->s_num_rd_atomic++; + } else if (opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct hfi1_qp_priv *priv = qp->priv; + + if (cmp_psn(psn, wqe->lpsn) <= 0) { + u32 cur_seg; + + cur_seg = (psn - wqe->psn) / priv->pkts_ps; + req->ack_pending = cur_seg - req->comp_seg; + priv->pending_tid_r_segs += req->ack_pending; + qp->s_num_rd_atomic += req->ack_pending; + } else { + priv->pending_tid_r_segs += req->total_segs; + qp->s_num_rd_atomic += req->total_segs; + } + } +} + /** * reset_psn - reset the QP state to send starting from PSN * @qp: the QP @@ -984,9 +1017,12 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) u32 n = qp->s_acked; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); u32 opcode; + struct hfi1_qp_priv *priv = qp->priv; lockdep_assert_held(&qp->s_lock); qp->s_cur = n; + priv->pending_tid_r_segs = 0; + qp->s_num_rd_atomic = 0; /* * If we are starting the request from the beginning, @@ -996,9 +1032,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } + update_num_rd_atomic(qp, psn, wqe); /* Find the work request opcode corresponding to the given PSN. 
*/ - opcode = wqe->wr.opcode; for (;;) { int diff; @@ -1008,8 +1044,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) break; wqe = rvt_get_swqe_ptr(qp, n); diff = cmp_psn(psn, wqe->psn); - if (diff < 0) + if (diff < 0) { + /* Point wqe back to the previous one*/ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); break; + } qp->s_cur = n; /* * If we are starting the request from the beginning, @@ -1019,8 +1058,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } - opcode = wqe->wr.opcode; + + update_num_rd_atomic(qp, psn, wqe); } + opcode = wqe->wr.opcode; /* * Set the state to restart in the middle of a request. @@ -1042,6 +1083,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; + case IB_WR_TID_RDMA_READ: + qp->s_state = TID_OP(READ_RESP); + break; + default: /* * This case shouldn't happen since its only @@ -1095,6 +1140,14 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) wqe = do_rc_completion(qp, wqe, ibp); qp->s_flags &= ~RVT_S_WAIT_ACK; } else { + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req; + + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + } + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); @@ -1108,7 +1161,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) } ibp = to_iport(qp->ibqp.device, qp->port_num); - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) ibp->rvp.n_rc_resends++; else ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index da8b63ec0f8d..f767c5c20566 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1622,6 +1622,27 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } +static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + u16 head, tail; + struct tid_rdma_flow *flow; + + head = req->setup_head; + tail = req->clear_tail; + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + flow = &req->flows[tail]; + if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && + cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + static struct tid_rdma_flow * __find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail, u32 psn, u16 *fidx) @@ -2714,3 +2735,64 @@ rcu_unlock: drop: return ret; } + +/* + * "Rewind" the TID request information. + * This means that we reset the state back to ACTIVE, + * find the proper flow, set the flow index to that flow, + * and reset the flow information. 
+ */ +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow; + int diff; + u32 tididx = 0; + u16 fidx; + + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + *bth2 = mask_psn(qp->s_psn); + flow = find_flow_ib(req, *bth2, &fidx); + if (!flow) + return; + } else { + return; + } + + diff = delta_psn(*bth2, flow->flow_state.ib_spsn); + + flow->sent = 0; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + if (diff) { + for (tididx = 0; tididx < flow->tidcnt; tididx++) { + u32 tidentry = flow->tid_entry[tididx], tidlen, + tidnpkts, npkts; + + flow->tid_offset = 0; + tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; + tidnpkts = rvt_div_round_up_mtu(qp, tidlen); + npkts = min_t(u32, diff, tidnpkts); + flow->pkt += npkts; + flow->sent += (npkts == tidnpkts ? tidlen : + npkts * qp->pmtu); + flow->tid_offset += npkts * qp->pmtu; + diff -= npkts; + if (!diff) + break; + } + } + + if (flow->tid_offset == + EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { + tididx++; + flow->tid_offset = 0; + } + flow->tid_idx = tididx; + /* Move flow_idx to correct index */ + req->flow_idx = fidx; + + req->state = TID_REQUEST_ACTIVE; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index d428236aef68..beb5982ce6ad 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -207,5 +207,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet); bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, struct hfi1_packet *packet); +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2); #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 24b11923da4c7dbf5690d3ac74710affaf564196 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:32:09 -0800 Subject: IB/hfi1: Integrate TID RDMA READ protocol into RC protocol This patch integrates the TID RDMA READ protocol into the IB RC protocol. This protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA READ request into a TID RDMA READ request to avoid data copying on the requester side. 
The following codes are added in this patch: - Send the TID RDMA READ request; - Complete the TID RDMA READ send request; - Send the TID RDMA READ response; - Complete the TID RDMA READ request; Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 + drivers/infiniband/hw/hfi1/rc.c | 295 +++++++++++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/tid_rdma.c | 33 ++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 1 + drivers/infiniband/hw/hfi1/verbs.c | 1 + 5 files changed, 307 insertions(+), 24 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 69c38af49492..5fea7319167e 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -761,6 +761,7 @@ void quiesce_qp(struct rvt_qp *qp) void notify_qp_reset(struct rvt_qp *qp) { + hfi1_qp_kern_exp_rcv_clear_all(qp); qp->r_adefered = 0; clear_ahg(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index e478a0b93eb9..a5aacf8e5b93 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -112,12 +112,14 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, { struct rvt_ack_entry *e; u32 hwords; - u32 len; - u32 bth0, bth2; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; struct hfi1_qp_priv *priv = qp->priv; + bool last_pkt; + u32 delta; lockdep_assert_held(&qp->s_lock); /* Don't send an ACK if we aren't supposed to. */ @@ -190,6 +192,26 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(READ_REQ)) { + /* + * If a TID RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_ack_state = TID_OP(READ_RESP); + goto read_resp; } else { /* COMPARE_SWAP or FETCH_ADD */ ps->s_txreq->ss = NULL; @@ -227,6 +249,28 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(READ_RESP): +read_resp: + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, + &bth1, &bth2, &len, + &last_pkt); + if (delta == 0) + goto error_qp; + hwords += delta; + if (last_pkt) { + e->sent = 1; + /* + * Increment qp->s_tail_ack_queue through s_ack_state + * transition. 
+ */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + break; + case TID_OP(READ_REQ): + goto bail; + default: normal: /* @@ -256,7 +300,14 @@ normal: ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; - +error_qp: + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + spin_lock_irqsave(&qp->r_lock, ps->flags); + spin_lock(&qp->s_lock); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, ps->flags); + spin_lock_irqsave(&qp->s_lock, ps->flags); bail: qp->s_ack_state = OP(ACKNOWLEDGE); /* @@ -283,16 +334,20 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); struct ib_other_headers *ohdr; - struct rvt_sge_state *ss; + struct rvt_sge_state *ss = NULL; struct rvt_swqe *wqe; - u32 hwords; - u32 len; - u32 bth0 = 0, bth2; + struct hfi1_swqe_priv *wpriv; + struct tid_rdma_request *req = NULL; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; int delta; + struct tid_rdma_flow *flow = NULL; lockdep_assert_held(&qp->s_lock); ps->s_txreq = get_txreq(ps->dev, qp); @@ -334,8 +389,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? - IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; } @@ -354,6 +409,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Send a request. */ wqe = rvt_get_swqe_ptr(qp, qp->s_cur); +check_s_state: switch (qp->s_state) { default: if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) @@ -375,9 +431,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* * If a fence is requested, wait for previous * RDMA read and atomic operations to finish. + * However, there is no need to guard against + * TID RDMA READ after TID RDMA READ. */ if ((wqe->wr.send_flags & IB_SEND_FENCE) && - qp->s_num_rd_atomic) { + qp->s_num_rd_atomic && + (wqe->wr.opcode != IB_WR_TID_RDMA_READ || + priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } @@ -526,6 +586,75 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_READ: + wpriv = wqe->priv; + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + + /* + * Don't allow more operations to be started + * than the QP limits allow. We could get here under + * three conditions; (1) It's a new request; (2) We are + * sending the second or later segment of a request, + * but the qp->s_state is set to OP(RDMA_READ_REQUEST) + * when the last segment of a previous request is + * received just before this; (3) We are re-sending a + * request. + */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + if (newreq) { + struct tid_rdma_flow *flow = + &req->flows[req->setup_head]; + + /* + * Set up s_sge as it is needed for TID + * allocation. However, if the pages have been + * walked and mapped, skip it. An earlier try + * has failed to allocate the TID entries. 
+ */ + if (!flow->npagesets) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + req->isge = 0; + req->clear_tail = req->setup_head; + req->flow_idx = req->setup_head; + req->state = TID_REQUEST_ACTIVE; + } + } else if (delta == 0) { + /* Re-send a request */ + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_pending = 0; + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + } + req->s_next_psn = qp->s_psn; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: /* @@ -571,11 +700,13 @@ no_flow_control: default: goto bail; } - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - qp->s_sge.total_len = wqe->length; - qp->s_len = wqe->length; + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + } if (newreq) { qp->s_tail++; if (qp->s_tail >= qp->s_size) @@ -583,6 +714,8 @@ no_flow_control: } if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_psn = wqe->lpsn + 1; + else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + qp->s_psn = req->s_next_psn; else qp->s_psn++; break; @@ -699,6 +832,99 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + case TID_OP(READ_RESP): + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto bail; + /* This is used to restart a TID read request */ + req = wqe_to_tid_req(wqe); + wpriv = wqe->priv; + /* + * Back down. The field qp->s_psn has been set to the psn with + * which the request should be restart. It's OK to use division + * as this is on the retry path. + */ + req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; + + /* + * The following function need to be redefined to return the + * status to make sure that we find the flow. At the same + * time, we can use the req->state change to check if the + * call succeeds or not. + */ + req->state = TID_REQUEST_RESEND; + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + if (req->state != TID_REQUEST_ACTIVE) { + /* + * Failed to find the flow. Release all allocated tid + * resources. 
+ */ + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + + hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); + goto bail; + } + req->state = TID_REQUEST_RESEND; + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + flow = &req->flows[req->flow_idx]; + len -= flow->sent; + req->s_next_psn = flow->flow_state.ib_lpsn + 1; + delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + break; + case TID_OP(READ_REQ): + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + /* + * If the current WR is not TID RDMA READ, or this is the start + * of a new request, we need to change the qp->s_state so that + * the request can be set up properly. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || + qp->s_cur == qp->s_tail) { + qp->s_state = OP(RDMA_READ_REQUEST); + if (delta == 0 || qp->s_cur == qp->s_tail) + goto check_s_state; + else + goto bail; + } + + /* Rate limiting */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + + wpriv = wqe->priv; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + break; } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); @@ -1148,8 +1374,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_kern_clear_hw_flow(priv->rcd, qp); } - rvt_send_complete(qp, wqe, - IB_WC_RETRY_EXC_ERR); + hfi1_trdma_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } return; @@ -1189,7 +1415,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) for (;;) { wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1238,8 +1465,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } opcode = ib_bth_get_opcode(ohdr); - if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && - opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(READ_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; @@ -1255,8 +1483,12 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) - rvt_add_retry_timer(qp); + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + if (opcode == TID_OP(READ_REQ)) + rvt_add_retry_timer_ext(qp, priv->timeout_shift); + else + rvt_add_retry_timer(qp); + } while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1265,6 +1497,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct 
hfi1_opa_header *opah) if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + trdma_clean_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; trace_hfi1_qp_send_completion(qp, wqe, s_last); @@ -1317,6 +1550,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { u32 s_last; + trdma_clean_swqe(qp, wqe); rvt_put_swqe(wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; @@ -1393,6 +1627,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, { struct hfi1_ibport *ibp; enum ib_wc_status status; + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_swqe *wqe; int ret = 0; u32 ack_psn; @@ -1439,6 +1674,8 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, */ if ((wqe->wr.opcode == IB_WR_RDMA_READ && (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_READ && + (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { @@ -1492,7 +1729,13 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, switch (aeth >> IB_AETH_NAK_SHIFT) { case 0: /* ACK */ this_cpu_inc(*ibp->rvp.rc_acks); - if (qp->s_acked != qp->s_tail) { + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + if (wqe_to_tid_req(wqe)->ack_pending) + rvt_mod_retry_timer_ext(qp, + qpriv->timeout_shift); + else + rvt_stop_rc_timers(qp); + } else if (qp->s_acked != qp->s_tail) { /* * We are expecting more ACKs so * mod the retry timer. @@ -1581,7 +1824,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, status); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + hfi1_kern_read_tid_flow_free(qp); + + hfi1_trdma_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1622,6 +1868,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index f767c5c20566..f6d9e2717106 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2796,3 +2796,36 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, req->state = TID_REQUEST_ACTIVE; } + +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) +{ + int i, ret; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs; + + if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) + return; + + /* + * First, clear the flow to help prevent any delayed packets from + * being delivered. 
+ */ + fs = &qpriv->flow_state; + if (fs->index != RXE_NUM_TID_FLOWS) + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + + for (i = qp->s_acked; i != qp->s_head;) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + if (++i == qp->s_size) + i = 0; + /* Free only locally allocated TID entries */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + continue; + do { + struct hfi1_swqe_priv *priv = wqe->priv; + + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index beb5982ce6ad..4f85b7ea5cf3 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -209,5 +209,6 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_packet *packet); void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 *bth2); +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 2d59fcde4db6..88676ca79fda 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -165,6 +165,7 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, [IB_WR_SEND_WITH_INV] = IB_WC_SEND, -- cgit v1.2.3-59-g8ed1b From a0b34f75ec209e40f06912380533ec525691544f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 24 Jan 2019 06:36:48 -0800 Subject: IB/hfi1: Add interlock between a TID RDMA request and other requests This locking mechanism is designed to provent vavious memory corruption scenarios from occurring when requests are pipelined, especially when RDMA READ/WRITE requests are interleaved with TID RDMA READ/WRITE requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; When memory corruption is likely, a request will be held back until previous requests have been completed. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 +++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 37 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 11 +++++++++++ drivers/infiniband/hw/hfi1/verbs.h | 3 +++ 4 files changed, 67 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index a5aacf8e5b93..349751cb8b47 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -482,6 +482,15 @@ check_s_state: len = wqe->length; ss = &qp->s_sge; bth2 = mask_psn(qp->s_psn); + + /* + * Interlock between various IB requests and TID RDMA + * if necessary. 
+ */ + if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_wqe_interlock(qp, wqe)) + goto bail; + switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: @@ -1321,6 +1330,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); } done: + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; qp->s_psn = psn; /* * Set RVT_S_WAIT_PSN as rc_complete() may start the timer @@ -1540,6 +1550,8 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, struct hfi1_ibport *ibp) { + struct hfi1_qp_priv *priv = qp->priv; + lockdep_assert_held(&qp->s_lock); /* * Don't decrement refcount and don't generate a @@ -1608,6 +1620,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, qp->s_draining = 0; wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } + if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + hfi1_schedule_send(qp); + } return wqe; } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index f6d9e2717106..ccf15c9e76f8 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2829,3 +2829,40 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) } while (!ret); } } + +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct rvt_swqe *prev; + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; + prev = rvt_get_swqe_ptr(qp, s_prev); + + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + break; + case IB_WR_TID_RDMA_READ: + switch (prev->wr.opcode) { + case IB_WR_RDMA_READ: + if (qp->s_acked != qp->s_cur) + goto interlock; + break; + default: + break; + } + default: + break; + } + return false; + +interlock: + priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; + return true; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 4f85b7ea5cf3..689a5490432f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -17,6 +17,16 @@ #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) +/* + * Bit definitions for priv->s_flags. + * These bit flags overload the bit flags defined for the QP's s_flags. + * Due to the fact that these bit fields are used only for the QP priv + * s_flags, there are no collisions. 
+ * + * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + */ +#define HFI1_S_TID_WAIT_INTERLCK BIT(5) + struct tid_rdma_params { struct rcu_head rcu_head; u32 qp; @@ -210,5 +220,6 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 *bth2); void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 7642b59ad5d2..841727a684d5 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -171,6 +171,9 @@ struct hfi1_qp_priv { u8 hdr_type; /* 9B or 16B */ unsigned long tid_timer_timeout_jiffies; + /* variables for the TID RDMA SE state machine */ + u32 s_flags; + /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ u32 tid_r_comp; /* Num of tid reads completed */ -- cgit v1.2.3-59-g8ed1b From f1ab4efa6d32e98f9e604c9dde57cfe7b89a6c07 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:32:30 -0800 Subject: IB/hfi1: Enable TID RDMA READ protocol This patch enables TID RDMA READ protocol by converting a qualified RDMA READ request into a TID RDMA READ request internally: (1) The TID RDMA capability must be enabled; (2) The request must start on a 4K page boundary and all receiving buffers must start on 4K page boundaries; (3) The request length must be a multiple of 4K and must be larger or equal to 256K. Each receiving buffer length must be a multiple of 4K. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 68 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 11 ++++++ 3 files changed, 80 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 5fea7319167e..acdd9eba189b 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -319,6 +319,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) switch (qp->ibqp.qp_type) { case IB_QPT_RC: + hfi1_setup_tid_rdma_wqe(qp, wqe); case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index ccf15c9e76f8..d65c030911c9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2866,3 +2866,71 @@ interlock: priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; return true; } + +/* Does @sge meet the alignment requirements for tid rdma? 
*/ +static inline bool hfi1_check_sge_align(struct rvt_sge *sge, int num_sge) +{ + int i; + + for (i = 0; i < num_sge; i++, sge++) + if ((u64)sge->vaddr & ~PAGE_MASK || + sge->sge_length & ~PAGE_MASK) + return false; + return true; +} + +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct hfi1_swqe_priv *priv = wqe->priv; + struct tid_rdma_params *remote; + enum ib_wr_opcode new_opcode; + bool do_tid_rdma = false; + struct hfi1_pportdata *ppd = qpriv->rcd->ppd; + + if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == + ppd->lid) + return; + if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) + return; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * If TID RDMA is disabled by the negotiation, don't + * use it. + */ + if (!remote) + goto exit; + + if (wqe->wr.opcode == IB_WR_RDMA_READ) { + if (hfi1_check_sge_align(&wqe->sg_list[0], wqe->wr.num_sge)) { + new_opcode = IB_WR_TID_RDMA_READ; + do_tid_rdma = true; + } + } + + if (do_tid_rdma) { + if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) + goto exit; + wqe->wr.opcode = new_opcode; + priv->tid_req.seg_len = + min_t(u32, remote->max_len, wqe->length); + priv->tid_req.total_segs = + DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); + /* Compute the last PSN of the request */ + wqe->lpsn = wqe->psn; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + priv->tid_req.n_flows = remote->max_read; + qpriv->tid_r_reqs++; + wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; + } + + priv->tid_req.cur_seg = 0; + priv->tid_req.comp_seg = 0; + priv->tid_req.ack_seg = 0; + priv->tid_req.state = TID_REQUEST_INACTIVE; + } +exit: + rcu_read_unlock(); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 689a5490432f..a53598ce45b2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -14,6 +14,7 @@ #define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) #define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) +#define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) @@ -222,4 +223,14 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe); +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (wqe->priv && + wqe->wr.opcode == IB_WR_RDMA_READ && + wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE) + setup_tid_rdma_wqe(qp, wqe); +} + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 3ce5daa2c1798a530db9a01cd35122e0958538ad Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:32:44 -0800 Subject: IB/hfi1: Add static trace for TID RDMA READ protocol This patch makes the following changes to the static trace: 1. Adds the decoding of TID RDMA READ packets in IB header trace; 2. Tracks qpriv->s_flags and iow_flags in qpsleepwakeup trace; 3. Adds a new event to track RC ACK receiving; 4. Adds trace events for various stages of the TID RDMA READ protocol. These events provide a fine-grained control for monitoring and debugging the hfi1 driver in the filed. 
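The new RC ACK event is built on a shared event class (hfi1_rc_ack_template/hfi1_rc_ack_do in the trace_rc.h hunk below), so further ACK-related events can reuse one field layout and print format. As a loose user-space analogue of that class/instance split (plain C; not the kernel tracepoint macros, and the names and fields are invented):

/*
 * User-space analogue of the event-class/event split; it is not the
 * kernel tracepoint API, and all names and fields are invented.
 */
#include <stdio.h>

/* "Event class": one payload layout and print format shared by events. */
struct rc_ack_payload {
        unsigned int qpn;
        unsigned int aeth;
        unsigned int psn;
};

static void rc_ack_class_print(const char *event,
                               const struct rc_ack_payload *p)
{
        printf("%s: qpn 0x%x aeth 0x%x psn 0x%x\n",
               event, p->qpn, p->aeth, p->psn);
}

/* "Event instance": a named event that reuses the class printer. */
static void trace_rc_ack_do(const struct rc_ack_payload *p)
{
        rc_ack_class_print("rc_ack_do", p);
}

int main(void)
{
        struct rc_ack_payload p = { .qpn = 0x12, .aeth = 0, .psn = 0x100 };

        trace_rc_ack_do(&p);
        return 0;
}

In the real headers this split is provided by DECLARE_EVENT_CLASS and DEFINE_EVENT, so each additional event only supplies its name and arguments.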
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 + drivers/infiniband/hw/hfi1/tid_rdma.c | 51 ++- drivers/infiniband/hw/hfi1/trace.c | 36 ++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 2 + drivers/infiniband/hw/hfi1/trace_rc.h | 48 +++ drivers/infiniband/hw/hfi1/trace_tid.h | 528 +++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/trace_tx.h | 12 +- 7 files changed, 684 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 349751cb8b47..6c9ef572fc69 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -121,6 +121,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bool last_pkt; u32 delta; + trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) @@ -349,6 +350,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int delta; struct tid_rdma_flow *flow = NULL; + trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); ps->s_txreq = get_txreq(ps->dev, qp); if (!ps->s_txreq) @@ -596,8 +598,13 @@ no_flow_control: break; case IB_WR_TID_RDMA_READ: + trace_hfi1_tid_read_sender_make_req(qp, newreq); wpriv = wqe->priv; req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_req_read(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); delta = cmp_psn(qp->s_psn, wqe->psn); /* @@ -892,6 +899,8 @@ no_flow_control: ++qp->s_cur == qp->s_size) qp->s_cur = 0; qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; case TID_OP(READ_REQ): req = wqe_to_tid_req(wqe); @@ -933,6 +942,8 @@ no_flow_control: ++qp->s_cur == qp->s_size) qp->s_cur = 0; qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; } qp->s_sending_hpsn = bth2; @@ -1341,6 +1352,7 @@ done: (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) qp->s_flags |= RVT_S_WAIT_PSN; qp->s_flags &= ~HFI1_S_AHG_VALID; + trace_hfi1_sender_reset_psn(qp); } /* @@ -1355,6 +1367,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) lockdep_assert_held(&qp->r_lock); lockdep_assert_held(&qp->s_lock); + trace_hfi1_sender_restart_rc(qp); if (qp->s_retry == 0) { if (qp->s_mig_state == IB_MIG_ARMED) { hfi1_migrate_qp(qp); @@ -1558,6 +1571,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * completion if the SWQE is being resent until the send * is finished. 
*/ + trace_hfi1_rc_completion(qp, wqe->lpsn); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { u32 s_last; @@ -1742,6 +1756,8 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, break; } + trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); + trace_hfi1_sender_do_rc_ack(qp); switch (aeth >> IB_AETH_NAK_SHIFT) { case 0: /* ACK */ this_cpu_inc(*ibp->rvp.rc_acks); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index d65c030911c9..0ee79403acaf 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1688,6 +1688,7 @@ u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, /* This is the IB psn used to send the request */ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); /* TID Entries for TID RDMA READ payload */ req_addr = &flow->tid_entry[flow->tid_idx]; @@ -1768,6 +1769,8 @@ u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, bool retry = true; u32 npkts = rvt_div_round_up_mtu(qp, *len); + trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); /* * Check sync conditions. Make sure that there are no pending * segments before freeing the flow. @@ -1883,6 +1886,8 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, */ flow->npkts = rvt_div_round_up_mtu(qp, len); for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_read_req(qp, i, + flow->tid_entry[i]); tlen = EXP_TID_GET(flow->tid_entry[i], LEN); if (!tlen) return 1; @@ -1917,6 +1922,7 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, flow->flow_state.ib_spsn = psn; flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; + trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); /* Set the initial flow index to the current flow. 
*/ req->flow_idx = req->setup_head; @@ -1942,6 +1948,8 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, req->total_segs = 1; req->r_flow_psn = e->psn; + trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); return 0; } @@ -1957,6 +1965,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, u8 prev; bool old_req; + trace_hfi1_rsp_tid_rcv_error(qp, psn); + trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); if (diff > 0) { /* sequence error */ if (!qp->r_nak_state) { @@ -1977,7 +1987,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, req = ack_to_tid_req(e); req->r_flow_psn = psn; - + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); if (e->opcode == TID_OP(READ_REQ)) { struct ib_reth *reth; u32 offset; @@ -2088,6 +2098,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_read_req(qp, psn); if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) rvt_comm_est(qp); @@ -2199,6 +2210,9 @@ u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, next_offset = flow->tid_offset + *len; last_pkt = (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); + rcu_read_lock(); remote = rcu_dereference(qpriv->tid_rdma.remote); if (!remote) { @@ -2293,6 +2307,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) unsigned long flags; u32 kpsn, ipsn; + trace_hfi1_sender_rcv_tid_read_resp(qp); is_fecn = process_ecn(qp, packet); kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); @@ -2322,6 +2337,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) hfi1_schedule_send(qp); } + trace_hfi1_ack(qp, ipsn); + trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, + req->e.swqe->psn, req->e.swqe->lpsn, + req); + trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); + /* Release the tid resources */ hfi1_kern_exp_rcv_clear(req); @@ -2671,6 +2692,8 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, unsigned long flags; bool ret = true; + trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", + packet->rhf); if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) return ret; @@ -2754,12 +2777,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { *bth2 = mask_psn(qp->s_psn); flow = find_flow_ib(req, *bth2, &fidx); - if (!flow) + if (!flow) { + trace_hfi1_msg_tid_restart_req(/* msg */ + qp, "!!!!!! Could not find flow to restart: bth2 ", + (u64)*bth2); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); return; + } } else { return; } + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); diff = delta_psn(*bth2, flow->flow_state.ib_spsn); flow->sent = 0; @@ -2794,6 +2825,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, /* Move flow_idx to correct index */ req->flow_idx = fidx; + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); req->state = TID_REQUEST_ACTIVE; } @@ -2868,14 +2902,17 @@ interlock: } /* Does @sge meet the alignment requirements for tid rdma? 
*/ -static inline bool hfi1_check_sge_align(struct rvt_sge *sge, int num_sge) +static inline bool hfi1_check_sge_align(struct rvt_qp *qp, + struct rvt_sge *sge, int num_sge) { int i; - for (i = 0; i < num_sge; i++, sge++) + for (i = 0; i < num_sge; i++, sge++) { + trace_hfi1_sge_check_align(qp, i, sge); if ((u64)sge->vaddr & ~PAGE_MASK || sge->sge_length & ~PAGE_MASK) return false; + } return true; } @@ -2904,7 +2941,8 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) goto exit; if (wqe->wr.opcode == IB_WR_RDMA_READ) { - if (hfi1_check_sge_align(&wqe->sg_list[0], wqe->wr.num_sge)) { + if (hfi1_check_sge_align(qp, &wqe->sg_list[0], + wqe->wr.num_sge)) { new_opcode = IB_WR_TID_RDMA_READ; do_tid_rdma = true; } @@ -2930,6 +2968,9 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) priv->tid_req.comp_seg = 0; priv->tid_req.ack_seg = 0; priv->tid_req.state = TID_REQUEST_INACTIVE; + trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + &priv->tid_req); } exit: rcu_read_unlock(); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index f1154c3c294e..28181d711fed 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -129,6 +129,10 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define IETH_PRN "ieth rkey:0x%.8x" #define ATOMICACKETH_PRN "origdata:%llx" #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" +#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x" +#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" +#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_READ_RSP_PRN "verbs_qp 0x%x" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -323,6 +327,38 @@ const char *parse_everbs_hdrs( parse_syndrome(be32_to_cpu(eh->aeth) >> 24), be32_to_cpu(eh->aeth) & IB_MSN_MASK); break; + case OP(TID_RDMA, READ_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_READ_REQ_PRN, + le32_to_cpu(eh->tid_rdma.r_req.kdeth0), + le32_to_cpu(eh->tid_rdma.r_req.kdeth1), + ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.r_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.r_req.reth.length), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.r_req.verbs_qp)); + break; + case OP(TID_RDMA, READ_RESP): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " " + TID_READ_RSP_PRN, + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); + break; /* aeth + atomicacketh */ case OP(RC, ATOMIC_ACKNOWLEDGE): trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 1dc2c28fc96e..1116238bf24d 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,6 +79,8 @@ 
__print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_READ_REQ), \ + ib_opcode_name(TID_RDMA_READ_RESP), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h index 8ce476570462..1ebca37862e0 100644 --- a/drivers/infiniband/hw/hfi1/trace_rc.h +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error, TP_ARGS(qp, psn) ); +DEFINE_EVENT(/* event */ + hfi1_rc_template, hfi1_rc_completion, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* rc_ack */ + hfi1_rc_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u8, opcode) + __field(u32, spsn) + __field(u32, lpsn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->opcode = wqe->wr.opcode; + __entry->spsn = wqe->psn; + __entry->lpsn = wqe->lpsn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->opcode, + __entry->spsn, + __entry->lpsn + ) +); + +DEFINE_EVENT(/* do_rc_ack */ + hfi1_rc_ack_template, hfi1_rc_ack_do, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe) +); + #endif /* __HFI1_TRACE_RC_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index c1da744f44a5..b71638c22d4b 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -31,11 +31,41 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \ "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \ - "npagesets %u tnode_cnt %u tidcnt %u length %u" + "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \ + "tidcnt %u tid_idx %u tid_offset %u length %u sent %u" #define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \ "used %u cnt %u" +#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ + "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ + "r_head_ack_queue %u s_tail_ack_queue %u " \ + "s_ack_state 0x%x " \ + "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx" + +#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \ + "s_head %u s_acked %u s_last %u s_psn 0x%x " \ + "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u" + +#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \ + "tid_r_comp %u pending_tid_r_segs %u " \ + "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ + "hw_flow_index %u generation 0x%x " \ + "fpsn 0x%x flow_flags 0x%x" + +#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ + "cur_seg %u comp_seg %u ack_seg %u " \ + "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ + "state %u r_flow_psn 0x%x " \ + "s_next_psn 0x%x" + +#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ + "s_tail_ack_queue %u " \ + 
"r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ + " diff %d" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -340,6 +370,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_restart_req, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + DECLARE_EVENT_CLASS(/* tid_flow_page */ hfi1_tid_flow_page_template, TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, @@ -429,10 +471,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __field(u32, fspsn) __field(u32, flpsn) __field(u32, r_next_psn) + __field(u32, ib_spsn) + __field(u32, ib_lpsn) __field(u32, npagesets) __field(u32, tnode_cnt) __field(u32, tidcnt) + __field(u32, tid_idx) + __field(u32, tid_offset) __field(u32, length) + __field(u32, sent) ), TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); @@ -446,10 +493,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __entry->flpsn = full_flow_psn(flow, flow->flow_state.lpsn); __entry->r_next_psn = flow->flow_state.r_next_psn; + __entry->ib_spsn = flow->flow_state.ib_spsn; + __entry->ib_lpsn = flow->flow_state.ib_lpsn; __entry->npagesets = flow->npagesets; __entry->tnode_cnt = flow->tnode_cnt; __entry->tidcnt = flow->tidcnt; + __entry->tid_idx = flow->tid_idx; + __entry->tid_offset = flow->tid_offset; __entry->length = flow->length; + __entry->sent = flow->sent; ), TP_printk(/* print */ TID_FLOW_PRN, @@ -462,10 +514,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __entry->fspsn, __entry->flpsn, __entry->r_next_psn, + __entry->ib_spsn, + __entry->ib_lpsn, __entry->npagesets, __entry->tnode_cnt, __entry->tidcnt, - __entry->length + __entry->tid_idx, + __entry->tid_offset, + __entry->length, + __entry->sent ) ); @@ -475,6 +532,36 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, flow) ); +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_restart_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + DECLARE_EVENT_CLASS(/* tid_node */ hfi1_tid_node_template, TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, @@ -557,6 +644,443 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, entry) ); +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DECLARE_EVENT_CLASS(/* rsp_info 
*/ + hfi1_responder_info_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u8, s_state) + __field(u32, psn) + __field(u32, r_psn) + __field(u8, r_state) + __field(u8, r_flags) + __field(u8, r_head_ack_queue) + __field(u8, s_tail_ack_queue) + __field(u8, s_ack_state) + __field(u8, s_nak_state) + __field(u8, r_nak_state) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_state = qp->s_state; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->r_state = qp->r_state; + __entry->r_flags = qp->r_flags; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_ack_state = qp->s_ack_state; + __entry->s_nak_state = qp->s_nak_state; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + ), + TP_printk(/* print */ + RSP_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_state, + __entry->psn, + __entry->r_psn, + __entry->r_state, + __entry->r_flags, + __entry->r_head_ack_queue, + __entry->s_tail_ack_queue, + __entry->s_ack_state, + __entry->s_nak_state, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags + ) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* sender_info */ + hfi1_sender_info_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u32, s_cur) + __field(u32, s_tail) + __field(u32, s_head) + __field(u32, s_acked) + __field(u32, s_last) + __field(u32, s_psn) + __field(u32, s_last_psn) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_num_rd) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_cur = qp->s_cur; + __entry->s_tail = qp->s_tail; + __entry->s_head = qp->s_head; + __entry->s_acked = qp->s_acked; + __entry->s_last = qp->s_last; + __entry->s_psn = qp->s_psn; + __entry->s_last_psn = qp->s_last_psn; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; + __entry->s_state = qp->s_state; + __entry->s_num_rd = qp->s_num_rd_atomic; + __entry->s_retry = qp->s_retry; + ), + TP_printk(/* print */ + SENDER_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_cur, + __entry->s_tail, + __entry->s_head, + __entry->s_acked, + __entry->s_last, + __entry->s_psn, + __entry->s_last_psn, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + 
__entry->s_num_rd, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_rc_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_reset_psn, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_do_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_read_sender */ + hfi1_tid_read_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, tid_r_reqs) + __field(u32, tid_r_comp) + __field(u32, pending_tid_r_segs) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(u32, flow_flags) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->tid_r_reqs = priv->tid_r_reqs; + __entry->tid_r_comp = priv->tid_r_comp; + __entry->pending_tid_r_segs = priv->pending_tid_r_segs; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->flow_flags = priv->flow_state.flags; + ), + TP_printk(/* print */ + TID_READ_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->tid_r_reqs, + __entry->tid_r_comp, + __entry->pending_tid_r_segs, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->flow_flags + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_rdma_request */ + hfi1_tid_rdma_request_template, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u8, opcode) + __field(u32, psn) + __field(u32, lpsn) + __field(u32, cur_seg) + __field(u32, comp_seg) + __field(u32, ack_seg) + __field(u32, total_segs) + __field(u16, setup_head) + __field(u16, clear_tail) + __field(u16, flow_idx) + __field(u32, state) + __field(u32, r_flow_psn) + __field(u32, s_next_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->lpsn = lpsn; + __entry->cur_seg = req->cur_seg; + __entry->comp_seg = req->comp_seg; + __entry->ack_seg = req->ack_seg; + __entry->total_segs = req->total_segs; + __entry->setup_head = req->setup_head; + __entry->clear_tail = req->clear_tail; + __entry->flow_idx = req->flow_idx; + 
__entry->state = req->state; + __entry->r_flow_psn = req->r_flow_psn; + __entry->s_next_psn = req->s_next_psn; + ), + TP_printk(/* print */ + TID_REQ_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->opcode, + __entry->psn, + __entry->lpsn, + __entry->cur_seg, + __entry->comp_seg, + __entry->ack_seg, + __entry->total_segs, + __entry->setup_head, + __entry->clear_tail, + __entry->flow_idx, + __entry->state, + __entry->r_flow_psn, + __entry->s_next_psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DECLARE_EVENT_CLASS(/* rc_rcv_err */ + hfi1_rc_rcv_err_template, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u8, state) + __field(u8, s_tail_ack_queue) + __field(u8, r_head_ack_queue) + __field(u32, opcode) + __field(u32, psn) + __field(u32, r_psn) + __field(int, diff) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->state = qp->state; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->diff = diff; + ), + TP_printk(/* print */ + RCV_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->state, + __entry->s_tail_ack_queue, + __entry->r_head_ack_queue, + __entry->opcode, + __entry->psn, + __entry->r_psn, + __entry->diff + ) +); + +DEFINE_EVENT(/* event */ + hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff) +); + +DECLARE_EVENT_CLASS(/* sge */ + hfi1_sge_template, + TP_PROTO(struct rvt_qp *qp, 
int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(u64, vaddr) + __field(u32, sge_length) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->vaddr = (u64)sge->vaddr; + __entry->sge_length = sge->sge_length; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->vaddr, + __entry->sge_length + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sge_template, hfi1_sge_check_align, + TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c57af3b31fe1..37dbb3e599c3 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, __field(u32, qpn) __field(u32, flags) __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) ), TP_fast_assign( DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) __entry->flags = flags; __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; + __entry->ps_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; ), TP_printk( - "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx", __get_str(dev), __entry->qpn, __entry->flags, - __entry->s_flags + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags ) ); -- cgit v1.2.3-59-g8ed1b From c098bbb00cd1986cbb58ed1712643f80ed00fcc3 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:28 -0800 Subject: IB/hfi1: Build TID RDMA WRITE request This patch adds the functions to build TID RDMA WRITE request. The work request opcode, packet opcode, and packet formats for TID RDMA WRITE protocol are also defined in this patch. 
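As a quick sanity check of the opcode layout this patch introduces, the composite TID RDMA opcode values can be reproduced in a few lines of standalone C. This is only an illustration: the 0xe0 base and the sub-opcode values are taken from the tid_rdma_defs.h hunk further down, while the "base plus sub-opcode" composition is an assumption about how the IB_OPCODE()/TID_OP() macros expand, not something the patch states.

/*
 * Standalone sketch, not kernel code.  Only the 0xe0 base and the
 * sub-opcode values come from the patch below; the simple "base + op"
 * composition is an assumed reading of the IB_OPCODE() macro.
 */
#include <stdio.h>

#define TID_RDMA_BASE 0xe0              /* IB_OPCODE_TID_RDMA */

enum {                                  /* sub-opcodes from tid_rdma_defs.h */
	WRITE_REQ       = 0x0,
	WRITE_RESP      = 0x1,
	WRITE_DATA      = 0x2,
	WRITE_DATA_LAST = 0x3,
	READ_REQ        = 0x4,
	READ_RESP       = 0x5,
	RESYNC          = 0x6,
	ACK             = 0x7,
};

#define TID_OPCODE(op) (TID_RDMA_BASE + (op))   /* assumed composition */

int main(void)
{
	printf("TID_RDMA_WRITE_REQ = 0x%x\n", TID_OPCODE(WRITE_REQ)); /* 0xe0 */
	printf("TID_RDMA_READ_RESP = 0x%x\n", TID_OPCODE(READ_RESP)); /* 0xe5 */
	printf("TID_RDMA_ACK       = 0x%x\n", TID_OPCODE(ACK));       /* 0xe7 */
	return 0;
}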
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.h | 2 ++ drivers/infiniband/hw/hfi1/tid_rdma.c | 38 ++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++ include/rdma/ib_hdrs.h | 5 ++++ include/rdma/tid_rdma_defs.h | 56 +++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index ce25a27aa4a1..f74e2509e8b9 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -64,12 +64,14 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 0ee79403acaf..089e301d9bcd 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2975,3 +2975,41 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) exit: rcu_read_unlock(); } + +/* TID RDMA WRITE functions */ + +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * Set the number of flow to be used based on negotiated + * parameters. 
+ */ + req->n_flows = remote->max_write; + req->state = TID_REQUEST_ACTIVE; + + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_req.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); + ohdr->u.tid_rdma.w_req.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); + ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + qp->s_state = TID_OP(WRITE_REQ); + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + *bth2 |= IB_BTH_REQ_ACK; + *len = 0; + + rcu_read_unlock(); + return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index a53598ce45b2..baba539b2b80 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -233,4 +233,7 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, setup_tid_rdma_wqe(qp, wqe); } +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); #endif /* HFI1_TID_RDMA_H */ diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 58a0a0f99e7f..9a90bd031e8c 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -123,6 +123,11 @@ union ib_ehdrs { union { struct tid_rdma_read_req r_req; struct tid_rdma_read_resp r_rsp; + struct tid_rdma_write_req w_req; + struct tid_rdma_write_resp w_rsp; + struct tid_rdma_write_data w_data; + struct tid_rdma_resync resync; + struct tid_rdma_ack ack; } tid_rdma; } __packed; diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h index 1c431ea32b52..08fe47c7ad2c 100644 --- a/include/rdma/tid_rdma_defs.h +++ b/include/rdma/tid_rdma_defs.h @@ -27,16 +27,71 @@ struct tid_rdma_read_resp { __be32 verbs_qp; }; +struct tid_rdma_write_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 reserved[2]; + __be32 verbs_qp; +}; + +struct tid_rdma_write_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[3]; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_write_data { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_resync { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_ack { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[2]; + __be32 tid_flow_psn; + __be32 verbs_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + /* * TID RDMA Opcodes */ #define IB_OPCODE_TID_RDMA 0xe0 enum { + IB_OPCODE_WRITE_REQ = 0x0, + IB_OPCODE_WRITE_RESP = 0x1, + IB_OPCODE_WRITE_DATA = 0x2, + IB_OPCODE_WRITE_DATA_LAST = 0x3, IB_OPCODE_READ_REQ = 0x4, IB_OPCODE_READ_RESP = 0x5, + IB_OPCODE_RESYNC = 0x6, + IB_OPCODE_ACK = 0x7, + IB_OPCODE(TID_RDMA, WRITE_REQ), + IB_OPCODE(TID_RDMA, WRITE_RESP), + IB_OPCODE(TID_RDMA, WRITE_DATA), + IB_OPCODE(TID_RDMA, WRITE_DATA_LAST), IB_OPCODE(TID_RDMA, READ_REQ), IB_OPCODE(TID_RDMA, READ_RESP), + IB_OPCODE(TID_RDMA, RESYNC), + IB_OPCODE(TID_RDMA, ACK), }; #define TID_OP(x) IB_OPCODE_TID_RDMA_##x @@ -47,6 +102,7 @@ enum { * low level drivers. Two of those are used but renamed * to be more descriptive. 
*/ +#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1 #define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 #endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3-59-g8ed1b From f5a4a95f4dd8a09d28936c2e1e357e4c8dcca6c1 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:38 -0800 Subject: IB/hfi1: Allow for extra entries in QP's s_ack_queue The TID RDMA WRITE protocol differs from normal IB RDMA WRITE in that TID RDMA WRITE requests do require responses, not just ACKs. Therefore, TID RDMA WRITE requests need to be treated as RDMA READ requests from the point of view of the QPs' s_ack_queue. In other words, the QPs' need to allow for TID RDMA WRITE requests to be stored in their s_ack_queue. However, because the user does not know anything about the TID RDMA capability and/or protocols, these extra entries in the queue cannot be advertized to the user. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.h | 11 +++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index baba539b2b80..9b952351f072 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -28,6 +28,17 @@ */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +/* + * Unlike regular IB RDMA VERBS, which do not require an entry + * in the s_ack_queue, TID RDMA WRITE requests do because they + * generate responses. + * Therefore, the s_ack_queue needs to be extended by a certain + * amount. The key point is that the queue needs to be extended + * without letting the "user" know so they user doesn't end up + * using these extra entries. + */ +#define HFI1_TID_RDMA_WRITE_CNT 8 + struct tid_rdma_params { struct rcu_head rcu_head; u32 qp; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 88676ca79fda..7b87b77582bd 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1888,7 +1888,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; dd->verbs_dev.rdi.dparms.reserved_operations = 1; - dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; -- cgit v1.2.3-59-g8ed1b From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:48 -0800 Subject: IB/hfi1: Add an s_acked_ack_queue pointer The s_ack_queue is managed by two pointers into the ring: r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of where the next received request is going to be placed and s_tail_ack_queue is the entry of the request currently being processed. This works perfectly fine for normal Verbs as the requests are processed one at a time and the s_tail_ack_queue is not moved until the request that it points to is fully completed. In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and the two pointers can easily be used to determine "queue full" and "queue empty" conditions. 
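The full/empty detection described above is the standard single-producer, single-consumer ring idiom: the head index is where the next received request lands and the tail index is the request being processed. A minimal sketch of that idiom follows; the structure, field names, and ring size are invented for the example and are not the driver's actual s_ack_queue layout.

/*
 * Toy model of the "chasing pointers" scheme, not the driver's code:
 * r_head is where the next received request would be stored, s_tail is
 * the oldest request still being processed.  Names and size are invented.
 */
#include <stdbool.h>

#define ACK_QUEUE_SIZE 8                /* arbitrary example size */

struct toy_ack_queue {
	unsigned int r_head;            /* next free slot */
	unsigned int s_tail;            /* oldest outstanding request */
};

static unsigned int ring_next(unsigned int i)
{
	return (i + 1) % ACK_QUEUE_SIZE;
}

static bool ring_empty(const struct toy_ack_queue *q)
{
	return q->r_head == q->s_tail;              /* nothing left to process */
}

static bool ring_full(const struct toy_ack_queue *q)
{
	return ring_next(q->r_head) == q->s_tail;   /* head would overrun tail */
}

int main(void)
{
	struct toy_ack_queue q = { .r_head = 0, .s_tail = 0 };

	/* Starts empty; producing entries until full leaves one slot unused. */
	while (!ring_full(&q))
		q.r_head = ring_next(q.r_head);

	return ring_empty(&q) ? 1 : 0;
}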
The detection of these two conditions are imported in determining when an old entry can safely be overwritten with a new received request and the resources associated with the old request be safely released. When pipelined TID RDMA WRITE is introduced into this mix, things look very different. r_head_ack_queue is still the point at which a newly received request will be inserted, s_tail_ack_queue is still the currently processed request. However, with pipelined TID RDMA WRITE requests, s_tail_ack_queue moves to the next request once all TID RDMA WRITE responses for that request have been sent. The rest of the protocol for a particular request is managed by other pointers specific to TID RDMA - r_tid_tail and r_tid_ack - which point to the entries for which the next TID RDMA DATA packets are going to arrive and the request for which the next TID RDMA ACK packets are to be generated, respectively. What this means is that entries in the ring, which are "behind" s_tail_ack_queue (entries which s_tail_ack_queue has gone past) are no longer considered complete. This is where the problem is - a newly received request could potentially overwrite a still active TID RDMA WRITE request. The reason why the TID RDMA pointers trail s_tail_ack_queue is that the normal Verbs send engine uses s_tail_ack_queue as the pointer for the next response. Since TID RDMA WRITE responses are processed by the normal Verbs send engine, s_tail_ack_queue had to be moved to the next entry once all TID RDMA WRITE response packets were sent to get the desired pipelining between requests. Doing otherwise would mean that the normal Verbs send engine would not be able to send the TID RDMA WRITE responses for the next TID RDMA request until the current one is fully completed. This patch introduces the s_acked_ack_queue index to point to the next request to complete on the responder side. For requests other than TID RDMA WRITE, s_acked_ack_queue should always be kept in sync with s_tail_ack_queue. For TID RDMA WRITE request, it may fall behind s_tail_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 33 ++++++++++++++++++++++++++++----- drivers/infiniband/hw/hfi1/rc.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 2 ++ drivers/infiniband/hw/hfi1/trace_tid.h | 10 ++++++++-- drivers/infiniband/sw/rdmavt/qp.c | 1 + include/rdma/rdmavt_qp.h | 1 + 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6c9ef572fc69..9dc8e524510e 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -120,6 +120,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_qp_priv *priv = qp->priv; bool last_pkt; u32 delta; + u8 next = qp->s_tail_ack_queue; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -149,9 +150,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > - rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - qp->s_tail_ack_queue = 0; + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. 
+ */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -172,6 +181,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -202,6 +215,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -2235,6 +2252,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, e->psn = psn; if (old_req) goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2248,6 +2267,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, */ if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2274,6 +2295,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; qp->s_tail_ack_queue = mra; break; } @@ -2646,7 +2669,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2723,7 +2746,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h index 4329eadcb3df..8e0935b9bf2a 100644 --- a/drivers/infiniband/hw/hfi1/rc.h +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 089e301d9bcd..c320a99afb35 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2044,6 +2044,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, goto unlock; } /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; /* * Since the qp->s_tail_ack_queue is modified, the diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index b71638c22d4b..51f5b0e8da71 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h 
@@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ "r_head_ack_queue %u s_tail_ack_queue %u " \ - "s_ack_state 0x%x " \ + "s_acked_ack_queue %u s_ack_state 0x%x " \ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ "iow_flags 0x%lx" @@ -62,7 +62,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ - "s_tail_ack_queue %u " \ + "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" @@ -671,6 +671,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __field(u8, r_flags) __field(u8, r_head_ack_queue) __field(u8, s_tail_ack_queue) + __field(u8, s_acked_ack_queue) __field(u8, s_ack_state) __field(u8, s_nak_state) __field(u8, r_nak_state) @@ -691,6 +692,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags = qp->r_flags; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_ack_state = qp->s_ack_state; __entry->s_nak_state = qp->s_nak_state; __entry->s_flags = qp->s_flags; @@ -709,6 +711,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags, __entry->r_head_ack_queue, __entry->s_tail_ack_queue, + __entry->s_acked_ack_queue, __entry->s_ack_state, __entry->s_nak_state, __entry->s_flags, @@ -1007,6 +1010,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __field(u32, qpn) __field(u32, s_flags) __field(u8, state) + __field(u8, s_acked_ack_queue) __field(u8, s_tail_ack_queue) __field(u8, r_head_ack_queue) __field(u32, opcode) @@ -1019,6 +1023,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->state = qp->state; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->opcode = opcode; @@ -1032,6 +1037,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn, __entry->s_flags, __entry->state, + __entry->s_acked_ack_queue, __entry->s_tail_ack_queue, __entry->r_head_ack_queue, __entry->opcode, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2769ebdf89fb..14ec2577bcaa 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_mig_state = IB_MIG_MIGRATED; qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; + qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d8d88d023092..4ee612ab6cb4 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -375,6 +375,7 @@ struct rvt_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_acked_ack_queue; /* index into s_ack_queue[] */ struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; -- cgit v1.2.3-59-g8ed1b From 07b923701e38f93b4725e64318e6483f890c1c1d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:59 -0800 Subject: IB/hfi1: Add functions to receive TID RDMA WRITE request This patch adds the functions to receive TID RDMA WRITE request. The request will be stored in the QP's s_ack_queue. 
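Before the receive handler itself, it may help to see its segment accounting in isolation: the WRITE request's byte length is split into segments of the locally negotiated maximum segment length, and the responder reserves one IB PSN per segment. The sketch below uses made-up lengths and PSNs; only the round-up division and the "last PSN = first PSN + segments - 1" relation mirror the handler added to tid_rdma.c further down.

/*
 * Standalone sketch of the segment math, not the kernel handler.  The
 * request length, negotiated segment size, and starting PSN are invented;
 * the rounding and lpsn relation mirror hfi1_rc_rcv_tid_rdma_write_req()
 * in the diff below.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int len     = 1024 * 1024;   /* 1 MiB TID RDMA WRITE (example) */
	unsigned int max_len = 256 * 1024;    /* negotiated segment size (example) */
	unsigned int psn     = 100;           /* PSN of the request (example) */

	unsigned int num_segs = DIV_ROUND_UP(len, max_len);

	printf("total_segs = %u\n", num_segs);            /* 4 */
	printf("e->psn     = %u\n", psn);                 /* 100 */
	printf("e->lpsn    = %u\n", psn + num_segs - 1);  /* 103 */
	return 0;
}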
This patch also adds code to handle duplicate TID RDMA WRITE request and a function to allocate TID resources for data receiving on the responder side. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 1 + drivers/infiniband/hw/hfi1/rc.c | 3 + drivers/infiniband/hw/hfi1/tid_rdma.c | 570 +++++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/tid_rdma.h | 16 + drivers/infiniband/hw/hfi1/verbs.h | 12 + 5 files changed, 601 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index d13304f7340d..7841a0ad7cb6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1512,6 +1512,7 @@ static int __init hfi1_mod_init(void) goto bail_dev; } + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9dc8e524510e..fcb733ea8dfb 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2411,6 +2411,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct ib_other_headers *ohdr = packet->ohdr; u32 opcode = packet->opcode; @@ -2716,6 +2717,7 @@ send_last: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -2789,6 +2791,7 @@ ack: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. 
*/ qp->s_flags |= RVT_S_RESP_PENDING; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index c320a99afb35..516dca9f497e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -109,12 +109,15 @@ static u32 mask_generation(u32 a) * C - Capcode */ +static u32 tid_rdma_flow_wt; + static void tid_rdma_trigger_resume(struct work_struct *work); static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, gfp_t gfp); static void hfi1_init_trdma_req(struct rvt_qp *qp, struct tid_rdma_request *req); +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -313,6 +316,11 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qpriv->r_tid_head = HFI1_QP_WQE_INVALID; + qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; + qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -1959,6 +1967,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_ack_entry *e; struct tid_rdma_request *req; unsigned long flags; @@ -1982,7 +1992,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, spin_lock_irqsave(&qp->s_lock, flags); e = find_prev_entry(qp, psn, &prev, NULL, &old_req); - if (!e || e->opcode != TID_OP(READ_REQ)) + if (!e || (e->opcode != TID_OP(READ_REQ) && + e->opcode != TID_OP(WRITE_REQ))) goto unlock; req = ack_to_tid_req(e); @@ -2042,6 +2053,114 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, */ if (old_req) goto unlock; + } else { + struct flow_state *fstate; + bool schedule = false; + u8 i; + + if (req->state == TID_REQUEST_RESEND) { + req->state = TID_REQUEST_RESEND_ACTIVE; + } else if (req->state == TID_REQUEST_INIT_RESEND) { + req->state = TID_REQUEST_INIT; + schedule = true; + } + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue). + * Also, don't change requests, which are at the SYNC + * point and haven't generated any responses yet. + * There is nothing to retransmit for them yet. + */ + if (old_req || req->state == TID_REQUEST_INIT || + (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { + for (i = prev + 1; ; i++) { + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + if (e->opcode == TID_OP(WRITE_REQ) && + req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + } + /* + * If the state of the request has been changed, + * the first leg needs to get scheduled in order to + * pick up the change. Otherwise, normal response + * processing should take care of it. + */ + if (!schedule) + goto unlock; + } + + /* + * If there is no more allocated segment, just schedule the qp + * without changing any state. 
+ */ + if (req->clear_tail == req->setup_head) + goto schedule; + /* + * If this request has sent responses for segments, which have + * not received data yet (flow_idx != clear_tail), the flow_idx + * pointer needs to be adjusted so the same responses can be + * re-sent. + */ + if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { + fstate = &req->flows[req->clear_tail].flow_state; + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, req->clear_tail, + MAX_FLOWS); + req->flow_idx = + CIRC_ADD(req->clear_tail, + delta_psn(psn, fstate->resp_ib_psn), + MAX_FLOWS); + qpriv->pending_tid_w_segs += + delta_psn(psn, fstate->resp_ib_psn); + /* + * When flow_idx == setup_head, we've gotten a duplicate + * request for a segment, which has not been allocated + * yet. In that case, don't adjust this request. + * However, we still want to go through the loop below + * to adjust all subsequent requests. + */ + if (CIRC_CNT(req->setup_head, req->flow_idx, + MAX_FLOWS)) { + req->cur_seg = delta_psn(psn, e->psn); + req->state = TID_REQUEST_RESEND_ACTIVE; + } + } + + for (i = prev + 1; ; i++) { + /* + * Look at everything up to and including + * s_tail_ack_queue + */ + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (e->opcode != TID_OP(WRITE_REQ) || + req->cur_seg == req->comp_seg || + req->state == TID_REQUEST_INIT || + req->state == TID_REQUEST_INIT_RESEND) { + if (req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + continue; + } + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, + req->clear_tail, + MAX_FLOWS); + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + req->cur_seg = req->comp_seg; + } } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -2054,6 +2173,18 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, * wrong memory region. */ qp->s_ack_state = OP(ACKNOWLEDGE); +schedule: + /* + * It's possible to receive a retry psn that is earlier than an RNRNAK + * psn. In this case, the rnrnak state should be cleared. + */ + if (qpriv->rnr_nak_state) { + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + hfi1_tid_write_alloc_resources(qp, true); + } + qp->r_state = e->opcode; qp->r_nak_state = 0; qp->s_flags |= RVT_S_RESP_PENDING; @@ -2164,6 +2295,14 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) qp->r_head_ack_queue = next; + /* + * For all requests other than TID WRITE which are added to the ack + * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to + * do this because of interlocks between these and TID WRITE + * requests. The same change has also been made in hfi1_rc_rcv(). + */ + qpriv->r_tid_alloc = qp->r_head_ack_queue; + /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; hfi1_schedule_send(qp); @@ -3015,3 +3154,432 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, rcu_read_unlock(); return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); } + +void hfi1_compute_tid_rdma_flow_wt(void) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. 
Rather than a computationaly expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points), assuming PMTU of 4K. The rationale + * for this is that flows are released and recycled at each sync point. + */ + tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / + TID_RDMA_MAX_SEGMENT_SIZE; +} + +static u32 position_in_queue(struct hfi1_qp_priv *qpriv, + struct tid_queue *queue) +{ + return qpriv->tid_enqueue - queue->dequeue; +} + +/* + * @qp: points to rvt_qp context. + * @to_seg: desired RNR timeout in segments. + * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] + */ +static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + u64 timeout; + u32 bytes_per_us; + u8 i; + + bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; + timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; + /* + * Find the next highest value in the RNR table to the required + * timeout. This gives the responder some padding. + */ + for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) + if (rvt_rnr_tbl_to_usec(i) >= timeout) + return i; + return 0; +} + +/** + * Central place for resource allocation at TID write responder, + * is called from write_req and write_data interrupt handlers as + * well as the send thread when a queued QP is scheduled for + * resource allocation. + * + * Iterates over (a) segments of a request and then (b) queued requests + * themselves to allocate resources for up to local->max_write + * segments across multiple requests. Stop allocating when we + * hit a sync point, resume allocating after data packets at + * sync point have been received. + * + * Resource allocation and sending of responses is decoupled. The + * request/segment which are being allocated and sent are as follows. + * Resources are allocated for: + * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] + * The send thread sends: + * [request: qp->s_tail_ack_queue, segment:req->cur_seg] + */ +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) +{ + struct tid_rdma_request *req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct tid_rdma_params *local = &qpriv->tid_rdma.local; + struct rvt_ack_entry *e; + u32 npkts, to_seg; + bool last; + int ret = 0; + + lockdep_assert_held(&qp->s_lock); + + while (1) { + /* + * Don't allocate more segments if a RNR NAK has already been + * scheduled to avoid messing up qp->r_psn: the RNR NAK will + * be sent only when all allocated segments have been sent. + * However, if more segments are allocated before that, TID RDMA + * WRITE RESP packets will be sent out for these new segments + * before the RNR NAK packet. When the requester receives the + * RNR NAK packet, it will restart with qp->s_last_psn + 1, + * which does not match qp->r_psn and will be dropped. + * Consequently, the requester will exhaust its retries and + * put the qp into error state. 
+ */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) + break; + + /* No requests left to process */ + if (qpriv->r_tid_alloc == qpriv->r_tid_head) { + /* If all data has been received, clear the flow */ + if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && + !qpriv->alloc_w_segs) + hfi1_kern_clear_hw_flow(rcd, qp); + break; + } + + e = &qp->s_ack_queue[qpriv->r_tid_alloc]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto next_req; + req = ack_to_tid_req(e); + /* Finished allocating for all segments of this request */ + if (req->alloc_seg >= req->total_segs) + goto next_req; + + /* Can allocate only a maximum of local->max_write for a QP */ + if (qpriv->alloc_w_segs >= local->max_write) + break; + + /* Don't allocate at a sync point with data packets pending */ + if (qpriv->sync_pt && qpriv->alloc_w_segs) + break; + + /* All data received at the sync point, continue */ + if (qpriv->sync_pt && !qpriv->alloc_w_segs) { + hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->sync_pt = false; + if (qpriv->s_flags & HFI1_R_TID_SW_PSN) + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } + + /* Allocate flow if we don't have one */ + if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { + ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); + if (ret) { + to_seg = tid_rdma_flow_wt * + position_in_queue(qpriv, + &rcd->flow_queue); + break; + } + } + + npkts = rvt_div_round_up_mtu(qp, req->seg_len); + + /* + * We are at a sync point if we run out of KDETH PSN space. + * Last PSN of every generation is reserved for RESYNC. + */ + if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { + qpriv->sync_pt = true; + break; + } + + /* + * If overtaking req->acked_tail, send an RNR NAK. Because the + * QP is not queued in this case, and the issue can only be + * caused due a delay in scheduling the second leg which we + * cannot estimate, we use a rather arbitrary RNR timeout of + * (MAX_FLOWS / 2) segments + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, + MAX_FLOWS)) { + ret = -EAGAIN; + to_seg = MAX_FLOWS >> 1; + qpriv->s_flags |= RVT_S_ACK_PENDING; + break; + } + + /* Try to allocate rcv array / TID entries */ + ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); + if (ret == -EAGAIN) + to_seg = position_in_queue(qpriv, &rcd->rarr_queue); + if (ret) + break; + + qpriv->alloc_w_segs++; + req->alloc_seg++; + continue; +next_req: + /* Begin processing the next request */ + if (++qpriv->r_tid_alloc > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qpriv->r_tid_alloc = 0; + } + + /* + * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation + * has failed (b) we are called from the rcv handler interrupt context + * (c) an RNR NAK has not already been scheduled + */ + if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) + goto send_rnr_nak; + + return; + +send_rnr_nak: + lockdep_assert_held(&qp->r_lock); + + /* Set r_nak_state to prevent unrelated events from generating NAK's */ + qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; + + /* Pull back r_psn to the segment being RNR NAK'd */ + qp->r_psn = e->psn + req->alloc_seg; + qp->r_ack_psn = qp->r_psn; + /* + * Pull back r_head_ack_queue to the ack entry following the request + * being RNR NAK'd. This allows resources to be allocated to the request + * if the queued QP is scheduled. + */ + qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; + if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qp->r_head_ack_queue = 0; + qpriv->r_tid_head = qp->r_head_ack_queue; + /* + * These send side fields are used in make_rc_ack(). 
They are set in + * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock + * for consistency + */ + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + /* + * Clear the ACK PENDING flag to prevent unwanted ACK because we + * have modified qp->s_ack_psn here. + */ + qp->s_flags &= ~(RVT_S_ACK_PENDING); + + /* + * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK + * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be + * used for this because qp->s_lock is dropped before calling + * hfi1_send_rc_ack() leading to inconsistency between the receive + * interrupt handlers and the send thread in make_rc_ack() + */ + qpriv->rnr_nak_state = TID_RNR_NAK_SEND; + + /* + * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive + * interrupt handlers but will be sent from the send engine behind any + * previous responses that may have been scheduled + */ + rc_defered_ack(rcd, qp); +} + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST + * (see hfi1_rc_rcv()) + * - Don't allow 0-length requests. + * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Prepare struct tid_rdma_flow array? + * 3. Set the qp->s_ack_state as state diagram in design doc. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + u32 bth0, psn, len, rkey, num_segs; + bool is_fecn; + u8 next; + u64 vaddr; + int diff; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.w_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + + num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) + return; + goto send_ack; + } + + /* + * The resent request which was previously RNR NAK'd is inserted at the + * location of the original request, which is one entry behind + * r_head_ack_queue + */ + if (qpriv->rnr_nak_state) + qp->r_head_ack_queue = qp->r_head_ack_queue ? + qp->r_head_ack_queue - 1 : + rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + + /* We've verified the request, insert it into the ack queue. 
*/ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlock; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + req = ack_to_tid_req(e); + + /* Bring previously RNR NAK'd request back to life */ + if (qpriv->rnr_nak_state) { + qp->r_nak_state = 0; + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + req->state = TID_REQUEST_INIT; + goto update_head; + } + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK) + goto nack_inv_unlock; + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (e->opcode == TID_OP(WRITE_REQ) && + (req->setup_head != req->clear_tail || + req->clear_tail != req->acked_tail)) + goto nack_inv_unlock; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE))) + goto nack_acc; + + qp->r_psn += num_segs - 1; + + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = qp->r_psn; + e->sent = 0; + + req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); + req->state = TID_REQUEST_INIT; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->alloc_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = num_segs; + req->r_flow_psn = e->psn; + req->ss.sge = e->rdma_sge; + req->ss.num_sge = 1; + + req->flow_idx = req->setup_head; + req->clear_tail = req->setup_head; + req->acked_tail = req->setup_head; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { + qpriv->r_tid_tail = qp->r_head_ack_queue; + } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { + struct tid_rdma_request *ptr; + + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + ptr = ack_to_tid_req(e); + + if (e->opcode != TID_OP(WRITE_REQ) || + ptr->comp_seg == ptr->total_segs) { + if (qpriv->r_tid_tail == qpriv->r_tid_ack) + qpriv->r_tid_ack = qp->r_head_ack_queue; + qpriv->r_tid_tail = qp->r_head_ack_queue; + } + } +update_head: + qp->r_head_ack_queue = next; + qpriv->r_tid_head = qp->r_head_ack_queue; + + hfi1_tid_write_alloc_resources(qp, true); + + /* Schedule the send tasklet. 
*/ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 9b952351f072..7780a28db316 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,7 +26,9 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +/* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_SW_PSN BIT(19) /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -89,10 +91,12 @@ struct tid_rdma_request { } e; struct tid_rdma_flow *flows; /* array of tid flows */ + struct rvt_sge_state ss; /* SGE state for TID RDMA requests */ u16 n_flows; /* size of the flow buffer window */ u16 setup_head; /* flow index we are setting up */ u16 clear_tail; /* flow index we are clearing */ u16 flow_idx; /* flow index most recently set up */ + u16 acked_tail; u32 seg_len; u32 total_len; @@ -103,6 +107,7 @@ struct tid_rdma_request { u32 cur_seg; /* index of current segment */ u32 comp_seg; /* index of last completed segment */ u32 ack_seg; /* index of last ack'ed segment */ + u32 alloc_seg; /* index of next segment to be allocated */ u32 isge; /* index of "current" sge */ u32 ack_pending; /* num acks pending for this request */ @@ -174,6 +179,12 @@ struct tid_rdma_flow { u32 tid_entry[TID_RDMA_MAX_PAGES]; }; +enum tid_rnr_nak_state { + TID_RNR_NAK_INIT = 0, + TID_RNR_NAK_SEND, + TID_RNR_NAK_SENT, +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -247,4 +258,9 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); + +void hfi1_compute_tid_rdma_flow_wt(void); + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 841727a684d5..9ced8a4a7b76 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,7 +172,15 @@ struct hfi1_qp_priv { unsigned long tid_timer_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 rnr_nak_state; /* RNR NAK state */ u32 s_flags; + u32 r_tid_head; /* Most recently added TID RDMA request */ + u32 r_tid_tail; /* the last completed TID RDMA request */ + u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ + u32 r_tid_alloc; /* Request for which we are allocating resources */ + u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 alloc_w_segs; /* Number of segments for which write */ + /* resources have been allocated for this QP */ /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ @@ -180,8 +188,12 @@ struct hfi1_qp_priv { u32 
pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + + u8 sync_pt; /* Set when QP reaches sync point */ }; +#define HFI1_QP_WQE_INVALID ((u32)-1) + struct hfi1_swqe_priv { struct tid_rdma_request tid_req; struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ -- cgit v1.2.3-59-g8ed1b From 38d46d3676ed6ecba284eb49e4b675ca9891801a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:09 -0800 Subject: IB/hfi1: Add a function to build TID RDMA WRITE response This patch adds the function to build TID RDMA WRITE response. The main role of the TID RDMA WRITE RESP packet is to send TID entries to the requester so that they can be used to encode TID RDMA WRITE DATA packet. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 95 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 5 ++ drivers/infiniband/hw/hfi1/verbs.h | 1 + 3 files changed, 101 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 516dca9f497e..78828f9f7592 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3583,3 +3583,98 @@ nack_acc: send_ack: hfi1_send_rc_ack(packet, is_fecn); } + +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = NULL; + u32 resp_len = 0, hdwords = 0; + void *resp_addr = NULL; + struct tid_rdma_params *remote; + + flow = &req->flows[req->flow_idx]; + switch (req->state) { + default: + /* + * Try to allocate resources here in case QP was queued and was + * later scheduled when resources became available + */ + hfi1_tid_write_alloc_resources(qp, false); + + /* We've already sent everything which is ready */ + if (req->cur_seg >= req->alloc_seg) + goto done; + + /* + * Resources can be assigned but responses cannot be sent in + * rnr_nak state, till the resent request is received + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) + goto done; + + req->state = TID_REQUEST_ACTIVE; + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + break; + + case TID_REQUEST_RESEND_ACTIVE: + case TID_REQUEST_RESEND: + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) + req->state = TID_REQUEST_ACTIVE; + + break; + } + flow->flow_state.resp_ib_psn = bth2; + resp_addr = (void *)flow->tid_entry; + resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; + req->cur_seg++; + + memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); + epriv->ss.sge.vaddr = resp_addr; + epriv->ss.sge.sge_length = resp_len; + epriv->ss.sge.length = epriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. 
+ */ + epriv->ss.sge.mr = NULL; + epriv->ss.sge.m = 0; + epriv->ss.sge.n = 0; + + epriv->ss.sg_list = NULL; + epriv->ss.total_len = epriv->ss.sge.sge_length; + epriv->ss.num_sge = 1; + + *ss = &epriv->ss; + *len = epriv->ss.total_len; + + /* Construct the TID RDMA WRITE RESP packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); + ohdr->u.tid_rdma.w_rsp.tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + (flow->flow_state.spsn & + HFI1_KDETH_BTH_SEQ_MASK)); + ohdr->u.tid_rdma.w_rsp.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); + qpriv->pending_tid_w_segs++; +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 7780a28db316..19f4dd89680f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -263,4 +263,9 @@ void hfi1_compute_tid_rdma_flow_wt(void); void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 9ced8a4a7b76..3a501b09621e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -200,6 +200,7 @@ struct hfi1_swqe_priv { }; struct hfi1_ack_priv { + struct rvt_sge_state ss; /* used for TID WRITE RESP */ struct tid_rdma_request tid_req; }; -- cgit v1.2.3-59-g8ed1b From 3c759e003a6a4d4b8fd0472f9501e8c45d775c26 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:19 -0800 Subject: IB/hfi1: Add TID resource timer This patch adds the TID resource timer, which is used by the responder to free any TID resources that are allocated for TID RDMA WRITE request and not returned by the requester after a reasonable time. 
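For illustration only (not part of the driver changes below), the reap timer builds on the standard kernel timer_list API. A minimal sketch of that pattern, with hypothetical example_* names: timer_setup() binds the callback at QP initialization, mod_timer() (re)arms it while TID segments remain outstanding, and del_timer_sync() tears it down when the QP is quiesced.

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    struct example_priv {
    	struct timer_list reap_timer;
    	unsigned long timeout_jiffies;
    };

    static void example_reap_timeout(struct timer_list *t)
    {
    	struct example_priv *p = from_timer(p, t, reap_timer);

    	/* reclaim TID resources the requester failed to return */
    }

    static void example_init(struct example_priv *p)
    {
    	timer_setup(&p->reap_timer, example_reap_timeout, 0);
    }

    static void example_arm(struct example_priv *p)
    {
    	/* (re)arm while segments are still outstanding */
    	mod_timer(&p->reap_timer, jiffies + p->timeout_jiffies);
    }

    static void example_teardown(struct example_priv *p)
    {
    	/* wait for a running callback before the QP goes away */
    	del_timer_sync(&p->reap_timer);
    }
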
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 92 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++ drivers/infiniband/hw/hfi1/verbs.h | 1 + 4 files changed, 97 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index acdd9eba189b..31b4b60f4364 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -755,6 +755,7 @@ void quiesce_qp(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_del_tid_reap_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 78828f9f7592..ede25ee195ff 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -118,6 +118,9 @@ static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, static void hfi1_init_trdma_req(struct rvt_qp *qp, struct tid_rdma_request *req); static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); +static void hfi1_tid_timeout(struct timer_list *t); +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -321,6 +324,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -3619,6 +3623,7 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, req->state = TID_REQUEST_ACTIVE; req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + hfi1_add_tid_reap_timer(qp); break; case TID_REQUEST_RESEND_ACTIVE: @@ -3627,6 +3632,7 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) req->state = TID_REQUEST_ACTIVE; + hfi1_mod_tid_reap_timer(qp); break; } flow->flow_state.resp_ib_psn = bth2; @@ -3678,3 +3684,89 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, done: return hdwords; } + +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + qpriv->s_tid_timer.expires = jiffies + + qpriv->tid_timer_timeout_jiffies; + add_timer(&qpriv->s_tid_timer); + } +} + +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + mod_timer(&qpriv->s_tid_timer, jiffies + + qpriv->tid_timer_timeout_jiffies); +} + +static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + rval = del_timer(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; + } + return rval; +} + +void hfi1_del_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = 
qp->priv; + + del_timer_sync(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; +} + +static void hfi1_tid_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); + struct rvt_qp *qp = qpriv->owner; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + unsigned long flags; + u32 i; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", + qp->ibqp.qp_num, __func__, __LINE__); + hfi1_stop_tid_reap_timer(qp); + /* + * Go though the entire ack queue and clear any outstanding + * HW flow and RcvArray resources. + */ + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct tid_rdma_request *req = + ack_to_tid_req(&qp->s_ack_queue[i]); + + hfi1_kern_exp_rcv_clear_all(req); + } + spin_unlock(&qp->s_lock); + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); + goto unlock_r_lock; + } + spin_unlock(&qp->s_lock); +unlock_r_lock: + spin_unlock_irqrestore(&qp->r_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 19f4dd89680f..39137e3c79fe 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,6 +26,7 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +#define HFI1_R_TID_RSC_TIMER BIT(2) /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) #define HFI1_R_TID_SW_PSN BIT(19) @@ -268,4 +269,6 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, u32 bth2, u32 *len, struct rvt_sge_state **ss); +void hfi1_del_tid_reap_timer(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3a501b09621e..68a41f54bc78 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -163,6 +163,7 @@ struct hfi1_qp_priv { u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct timer_list s_tid_timer; /* for timing tid wait */ struct list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; struct tid_flow_state flow_state; -- cgit v1.2.3-59-g8ed1b From 72a0ea99ec13bcb27784c1a48f4e8fda61586c26 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:31 -0800 Subject: IB/hfi1: Add a function to receive TID RDMA WRITE response This patch adds a function to receive TID RDMA WRITE response. The TID entries will be stored for encoding TID RDMA WRITE DATA packet later. 
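The received TID entries are parked in a circular array of flows on the software queue entry: setup_head advances as WRITE RESP packets arrive, clear_tail as segment data completes, and acked_tail as TID ACKs come back. A small self-contained sketch of that bookkeeping follows; the EXAMPLE_* names and the power-of-two window size are assumptions for illustration, not driver code.

    #include <linux/circ_buf.h>
    #include <linux/types.h>

    #define EXAMPLE_MAX_FLOWS	8	/* assumed power-of-two flow window */
    #define EXAMPLE_NEXT(v)	(((v) + 1) & (EXAMPLE_MAX_FLOWS - 1))

    struct example_req {
    	u16 setup_head;		/* next flow to fill from a WRITE RESP */
    	u16 clear_tail;		/* flow whose data is currently arriving */
    	u16 acked_tail;		/* oldest flow not yet TID ACK'ed */
    };

    /* refuse a new response if it would overwrite a not-yet-acked flow */
    static bool example_can_accept_resp(const struct example_req *req)
    {
    	return CIRC_SPACE(req->setup_head, req->acked_tail,
    			  EXAMPLE_MAX_FLOWS) != 0;
    }

    static void example_resp_received(struct example_req *req)
    {
    	req->setup_head = EXAMPLE_NEXT(req->setup_head);
    }

    static void example_segment_acked(struct example_req *req)
    {
    	req->acked_tail = EXAMPLE_NEXT(req->acked_tail);
    }
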
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 173 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 3 files changed, 180 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index ede25ee195ff..92b6a3d90ce5 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -319,6 +319,9 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; + qpriv->s_tid_head = HFI1_QP_WQE_INVALID; + qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; qpriv->rnr_nak_state = TID_RNR_NAK_INIT; qpriv->r_tid_head = HFI1_QP_WQE_INVALID; qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; @@ -3770,3 +3773,173 @@ static void hfi1_tid_timeout(struct timer_list *t) unlock_r_lock: spin_unlock_irqrestore(&qp->r_lock, flags); } + +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that TIDENTRY array has enough space for a complete + * segment. If not, put QP in error state. + * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow + * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 5. Set qp->s_state + * 6. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + enum ib_wc_status status; + u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; + bool is_fecn; + unsigned long flags; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Ignore invalid responses */ + if (cmp_psn(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) + goto ack_done; + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + + /* + * If we are waiting for a particular packet sequence number + * due to a request being resent, check for it. Otherwise, + * ensure that we haven't missed anything. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + /* + * If we've lost ACKs and our acked_tail pointer is too far + * behind, don't overwrite segments. Just drop the packet and + * let the reliability protocol take care of it. + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) + goto ack_done; + + /* + * The call to do_rc_ack() should be last in the chain of + * packet checks because it will end up updating the QP state. 
+ * Therefore, anything that would prevent the packet from + * being accepted as a successful response should be prior + * to it. + */ + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + + flow = &req->flows[req->setup_head]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->resync_npkts = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->flow_state.resp_ib_psn = psn; + flow->length = min_t(u32, req->seg_len, + (wqe->length - (req->comp_seg * req->seg_len))); + + flow->npkts = rvt_div_round_up_mtu(qp, flow->length); + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + + req->comp_seg++; + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. + */ + for (i = 0; i < flow->tidcnt; i++) { + if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); + } + if (tidlen * PAGE_SIZE < flow->length) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + + /* + * If this is the first response for this request, set the initial + * flow index to the current flow. + */ + if (!cmp_psn(psn, wqe->psn)) { + req->r_last_acked = mask_psn(wqe->psn - 1); + /* Set acked flow index to head index */ + req->acked_tail = req->setup_head; + } + + /* advance circular buffer head */ + req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); + req->state = TID_REQUEST_ACTIVE; + + /* + * If all responses for this TID RDMA WRITE request have been received + * advance the pointer to the next one. + * Since TID RDMA requests could be mixed in with regular IB requests, + * they might not appear sequentially in the queue. Therefore, the + * next request needs to be "found". 
+ */ + if (qpriv->s_tid_cur != qpriv->s_tid_head && + req->comp_seg == req->total_segs) { + for (i = qpriv->s_tid_cur + 1; ; i++) { + if (i == qp->s_size) + i = 0; + wqe = rvt_get_swqe_ptr(qp, i); + if (i == qpriv->s_tid_head) + break; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + qpriv->s_tid_cur = i; + } + qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; + + goto ack_done; + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; +ack_err: + rvt_error_qp(qp, status); +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 39137e3c79fe..6f11fd5ca4c0 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -102,6 +102,7 @@ struct tid_rdma_request { u32 seg_len; u32 total_len; u32 r_flow_psn; /* IB PSN of next segment start */ + u32 r_last_acked; /* IB PSN of last ACK'ed packet */ u32 s_next_psn; /* IB PSN of next segment start for read */ u32 total_segs; /* segments required to complete a request */ @@ -175,6 +176,7 @@ struct tid_rdma_flow { u8 npagesets; u8 npkts; u8 pkt; + u8 resync_npkts; struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; u32 tid_entry[TID_RDMA_MAX_PAGES]; @@ -271,4 +273,6 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, void hfi1_del_tid_reap_timer(struct rvt_qp *qp); +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 68a41f54bc78..b2096c7c1132 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -175,6 +175,9 @@ struct hfi1_qp_priv { /* variables for the TID RDMA SE state machine */ u8 rnr_nak_state; /* RNR NAK state */ u32 s_flags; + u32 s_tid_cur; + u32 s_tid_head; + u32 s_tid_tail; u32 r_tid_head; /* Most recently added TID RDMA request */ u32 r_tid_tail; /* the last completed TID RDMA request */ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ -- cgit v1.2.3-59-g8ed1b From 539e1908e45b5cdcc72bded272f8adb52ad2c913 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:41 -0800 Subject: IB/hfi1: Add a function to build TID RDMA WRITE DATA packet This patch adds a function to build TID RDMA WRITE DATA packet. 
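Each TID RDMA WRITE DATA packet carries a KDETH PSN whose high bits hold the flow generation and whose low bits hold the per-flow packet sequence number. A minimal helper sketch of that composition, assuming the HFI1_KDETH_BTH_SEQ_SHIFT/HFI1_KDETH_BTH_SEQ_MASK definitions introduced earlier in this series (illustrative only):

    /* KDETH PSN of the pkt-th packet of a flow: generation | sequence */
    static u32 example_kdeth_data_psn(u32 generation, u32 spsn, u32 pkt)
    {
    	return (generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
    	       ((spsn + pkt) & HFI1_KDETH_BTH_SEQ_MASK);
    }
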
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 62 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 +++ 2 files changed, 66 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 92b6a3d90ce5..243feaddb811 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3943,3 +3943,65 @@ ack_done: if (is_fecn) hfi1_send_rc_ack(packet, is_fecn); } + +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct tid_rdma_params *remote; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + + if (!tidlen) { + hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); + rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); + } + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && + next_offset >= tidlen) || (flow->sent >= flow->length); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(wd->kdeth0, KVER, 0x1); + KDETH_SET(wd->kdeth0, SH, !last_pkt); + KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); + wd->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + if (last_pkt) { + /* PSNs are zero-based, so +1 to count number of packets */ + if (flow->flow_state.lpsn + 1 + + rvt_div_round_up_mtu(qp, req->seg_len) > + MAX_TID_FLOW_PSN) + req->state = TID_REQUEST_SYNC; + *bth2 |= IB_BTH_REQ_ACK; + } + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + return last_pkt; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 6f11fd5ca4c0..f28c7ab752b2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -275,4 +275,8 @@ void hfi1_del_tid_reap_timer(struct rvt_qp *qp); void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet); +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From d72fe7d5008b5600a11f03a0dcb743fd7acb0085 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:49:51 -0800 Subject: IB/hfi1: Add a function to receive TID RDMA WRITE DATA packet This patch adds a function to receive TID RDMA WRITE DATA packet, which is in 
the KDETH PSN space in packet ordering. Due to the use of header suppression, software is generally only notified when the last data packet for a segment is received. This patch also adds code to handle KDETH EFLAGS errors for ingress TID RDMA WRITE DATA packets. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 236 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 3 files changed, 241 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 243feaddb811..166a34c8449d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2566,13 +2566,32 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, u8 opcode) { struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; u32 ipsn; struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + u32 i; if (rcv_type >= RHF_RCV_TYPE_IB) goto done; spin_lock(&qp->s_lock); + + /* + * We've ran out of space in the eager buffer. + * Eagerly received KDETH packets which require space in the + * Eager buffer (packet that have payload) are TID RDMA WRITE + * response packets. In this case, we have to re-transmit the + * TID RDMA WRITE request. + */ + if (rcv_type == RHF_RCV_TYPE_EAGER) { + hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); + hfi1_schedule_send(qp); + goto done_unlock; + } + /* * For TID READ response, error out QP after freeing the tid * resources. @@ -2586,8 +2605,25 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto done; } + goto done_unlock; + } + + /* + * Error out the qp for TID RDMA WRITE + */ + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + for (i = 0; i < rvt_max_atomic(rdi); i++) { + e = &qp->s_ack_queue[i]; + if (e->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(e); + hfi1_kern_exp_rcv_clear_all(req); + } } + spin_unlock(&qp->s_lock); + rvt_rc_error(qp, IB_WC_LOC_LEN_ERR); + goto done; +done_unlock: spin_unlock(&qp->s_lock); done: return true; @@ -2837,8 +2873,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, u8 opcode; u32 qp_num, psn, ibpsn; struct rvt_qp *qp; + struct hfi1_qp_priv *qpriv; unsigned long flags; bool ret = true; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", packet->rhf); @@ -2897,14 +2937,109 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, ibpsn = mask_psn(ibpsn); ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, ibpsn); + goto r_unlock; + } + + /* + * qp->s_tail_ack_queue points to the rvt_ack_entry currently being + * processed. These a completed sequentially so we can be sure that + * the pointer will not change until the entire request has completed. 
+ */ + spin_lock(&qp->s_lock); + qpriv = qp->priv; + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + req = ack_to_tid_req(e); + flow = &req->flows[req->clear_tail]; + + switch (rcv_type) { + case RHF_RCV_TYPE_EXPECTED: + switch (rte) { + case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: + if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { + u64 reg; + + qpriv->s_flags |= HFI1_R_TID_SW_PSN; + /* + * The only sane way to get the amount of + * progress is to read the HW flow state. + */ + reg = read_uctxt_csr(dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + + (8 * flow->idx)); + flow->flow_state.r_next_psn = mask_psn(reg); + qpriv->r_next_psn_kdeth = + flow->flow_state.r_next_psn; + goto nak_psn; + } else { + /* + * If the received PSN does not match the next + * expected PSN, NAK the packet. + * However, only do that if we know that the a + * NAK has already been sent. Otherwise, this + * mismatch could be due to packets that were + * already in flight. + */ + if (psn != flow->flow_state.r_next_psn) { + psn = flow->flow_state.r_next_psn; + goto nak_psn; + } + + qpriv->s_nak_state = 0; + /* + * If SW PSN verification is successful and this + * is the last packet in the segment, tell the + * caller to process it as a normal packet. + */ + if (psn == full_flow_psn(flow, + flow->flow_state.lpsn)) + ret = false; + qpriv->r_next_psn_kdeth = + ++flow->flow_state.r_next_psn; + } + break; + + case RHF_RTE_EXPECTED_FLOW_GEN_ERR: + goto nak_psn; + + default: + break; + } + break; + + case RHF_RCV_TYPE_ERROR: + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: + case RHF_RTE_ERROR_KHDR_HCRC_ERR: + case RHF_RTE_ERROR_KHDR_KVER_ERR: + case RHF_RTE_ERROR_CONTEXT_ERR: + case RHF_RTE_ERROR_KHDR_TID_ERR: + default: + break; + } + default: + break; } +unlock: + spin_unlock(&qp->s_lock); r_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); rcu_unlock: rcu_read_unlock(); drop: return ret; +nak_psn: + ibp->rvp.n_rc_seqnak++; + if (!qpriv->s_nak_state) { + qpriv->s_nak_state = IB_NAK_PSN_ERROR; + /* We are NAK'ing the next expected PSN */ + qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); + qpriv->s_flags |= RVT_S_ACK_PENDING; + if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) + qpriv->r_tid_ack = qpriv->r_tid_tail; + } + goto unlock; } /* @@ -4005,3 +4140,104 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, } return last_pkt; } + +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = priv->rcd; + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u32 psn, next; + u8 opcode; + + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + /* + * All error handling should be done by now. If we are here, the packet + * is either good or been accepted by the error handler. 
+ */ + spin_lock_irqsave(&qp->s_lock, flags); + e = &qp->s_ack_queue[priv->r_tid_tail]; + req = ack_to_tid_req(e); + flow = &req->flows[req->clear_tail]; + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { + if (cmp_psn(psn, flow->flow_state.r_next_psn)) + goto send_nak; + flow->flow_state.r_next_psn++; + goto exit; + } + flow->flow_state.r_next_psn = mask_psn(psn + 1); + hfi1_kern_exp_rcv_clear(req); + priv->alloc_w_segs--; + rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; + req->comp_seg++; + priv->s_nak_state = 0; + + /* + * Release the flow if one of the following conditions has been met: + * - The request has reached a sync point AND all outstanding + * segments have been completed, or + * - The entire request is complete and there are no more requests + * (of any kind) in the queue. + */ + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; + + if (opcode == TID_OP(WRITE_DATA_LAST)) { + for (next = priv->r_tid_tail + 1; ; next++) { + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + if (next == priv->r_tid_head) + break; + e = &qp->s_ack_queue[next]; + if (e->opcode == TID_OP(WRITE_REQ)) + break; + } + priv->r_tid_tail = next; + if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) + qp->s_acked_ack_queue = 0; + } + + hfi1_tid_write_alloc_resources(qp, true); + + /* + * If we need to generate more responses, schedule the + * send engine. + */ + if (req->cur_seg < req->total_segs || + qp->s_tail_ack_queue != qp->r_head_ack_queue) { + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + } + + priv->pending_tid_w_segs--; + if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { + if (priv->pending_tid_w_segs) + hfi1_mod_tid_reap_timer(req->qp); + else + hfi1_stop_tid_reap_timer(req->qp); + } + +done: + priv->s_flags |= RVT_S_ACK_PENDING; +exit: + priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + +send_nak: + if (!priv->s_nak_state) { + priv->s_nak_state = IB_NAK_PSN_ERROR; + priv->s_nak_psn = flow->flow_state.r_next_psn; + priv->s_flags |= RVT_S_ACK_PENDING; + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; + } + goto done; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index f28c7ab752b2..647a6f0cba31 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -279,4 +279,6 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index b2096c7c1132..eec6e822635b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -174,6 +174,8 @@ struct hfi1_qp_priv { /* variables for the TID RDMA SE state machine */ u8 rnr_nak_state; /* RNR NAK state */ + u8 s_nak_state; + u32 s_nak_psn; u32 s_flags; u32 s_tid_cur; u32 s_tid_head; @@ -193,6 +195,7 @@ struct hfi1_qp_priv { u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + u32 r_next_psn_kdeth; u8 sync_pt; /* Set when QP reaches sync point */ }; -- cgit v1.2.3-59-g8ed1b From 0f75e325aa11552599a18d7558970be16fc15c1a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:03 -0800 Subject: IB/hfi1: Add a function to build TID RDMA ACK packet This patch adds a function 
to build TID RDMA ACJ packet, which is also in the KDETH PSN space for packet ordering. This packet is used to acknowledge the receiving of all the TID RDMA WRITE DATA packets before the given KDETH PSN. Similar to RC ACK packets, TID RDMA ACK packets could also be coalesced. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 77 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++ drivers/infiniband/hw/hfi1/verbs.h | 2 + 3 files changed, 83 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 166a34c8449d..d8a7f07b028d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4241,3 +4241,80 @@ send_nak: } goto done; } + +static bool hfi1_tid_rdma_is_resync_psn(u32 psn) +{ + return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == + HFI1_KDETH_BTH_SEQ_MASK); +} + +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct tid_rdma_request *req = ack_to_tid_req(e); + struct tid_rdma_flow *flow = &req->flows[iflow]; + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + if (qpriv->resync) { + *bth2 = mask_psn((fs->generation << + HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } else if (qpriv->s_nak_state) { + *bth2 = mask_psn(qpriv->s_nak_psn); + ohdr->u.tid_rdma.ack.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qpriv->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + } else { + *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + ohdr->u.tid_rdma.ack.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + + ohdr->u.tid_rdma.ack.tid_flow_psn = 0; + ohdr->u.tid_rdma.ack.verbs_psn = + cpu_to_be32(flow->flow_state.resp_ib_psn); + + if (qpriv->resync) { + /* + * If the PSN before the current expect KDETH PSN is the + * RESYNC PSN, then we never received a good TID RDMA WRITE + * DATA packet after a previous RESYNC. + * In this case, the next expected KDETH PSN stays the same. + */ + if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + } else { + /* + * Because the KDETH PSNs jump during a RESYNC, it's + * not possible to infer (or compute) the previous value + * of r_next_psn_kdeth in the case of back-to-back + * RESYNC packets. Therefore, we save it. 
+ */ + qpriv->r_next_psn_kdeth_save = + qpriv->r_next_psn_kdeth - 1; + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); + } + qpriv->resync = false; + } + + return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 647a6f0cba31..89f5af627128 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -281,4 +281,8 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index eec6e822635b..3a97a39aeba4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -196,7 +196,9 @@ struct hfi1_qp_priv { u8 timeout_shift; /* account for number of packets per segment */ u32 r_next_psn_kdeth; + u32 r_next_psn_kdeth_save; u8 sync_pt; /* Set when QP reaches sync point */ + u8 resync; }; #define HFI1_QP_WQE_INVALID ((u32)-1) -- cgit v1.2.3-59-g8ed1b From 9e93e967f7b452e6c9e4a33d0b42ff64fa7293c4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:14 -0800 Subject: IB/hfi1: Add a function to receive TID RDMA ACK packet This patch adds a function to receive TID RDMA ACK packet, which could be an acknowledge to either a TID RDMA WRITE DATA packet or an TID RDMA RESYNC packet. For an ACK to TID RDMA WRITE DATA packet, the request segments are completed appropriately. For an ACK to a TID RDMA RESYNC packet, any pending segment flow information is updated accordingly. 
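The handler classifies the incoming AETH the same way the RC code does: the top bits of the AETH distinguish ACK from NAK, and for a NAK the credit field carries the NAK code (PSN sequence error is 0). A small sketch of that decode, relying on the IB_AETH_CREDIT_* definitions already used elsewhere in the driver (illustrative only; example_aeth_is_nak is not a driver function):

    #include <rdma/ib_verbs.h>

    /* Return true and the NAK code if the AETH encodes a NAK (top bits == 3). */
    static bool example_aeth_is_nak(u32 aeth, u8 *nak_code)
    {
    	if ((aeth >> 29) != 3)
    		return false;	/* plain ACK, handled separately */
    	*nak_code = (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK;
    	return true;
    }
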
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/opfn.c | 3 + drivers/infiniband/hw/hfi1/qp.h | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 212 +++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 + drivers/infiniband/hw/hfi1/trace_tid.h | 5 +- drivers/infiniband/hw/hfi1/verbs.h | 4 + 6 files changed, 228 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 2ca070690b2f..82e1889ca969 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -245,6 +245,9 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; + if (attr_mask & IB_QP_RETRY_CNT) + priv->s_retry = attr->retry_cnt; + spin_lock_irqsave(&priv->opfn.lock, flags); if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { struct tid_rdma_params *local = &priv->tid_rdma.local; diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index f74e2509e8b9..d531b760ea93 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -65,6 +65,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response + * HFI1_S_WAIT_HALT - halt the first leg send engine * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 @@ -72,6 +73,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 #define HFI1_S_WAIT_TID_RESP 0x08000000 +#define HFI1_S_WAIT_HALT 0x04000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index d8a7f07b028d..5eb8453a719e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -319,6 +319,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_state = TID_OP(WRITE_RESP); qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; qpriv->s_tid_head = HFI1_QP_WQE_INVALID; qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; @@ -327,6 +328,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); @@ -4318,3 +4320,213 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); } + +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn; + bool is_fecn; + unsigned long flags; + u16 fidx; + + 
is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); + req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); + resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); + + spin_lock_irqsave(&qp->s_lock, flags); + + /* If we are waiting for an ACK to RESYNC, drop any other packets */ + if ((qp->s_flags & HFI1_S_WAIT_HALT) && + cmp_psn(psn, qpriv->s_resync_psn)) + goto ack_op_err; + + ack_psn = req_psn; + if (hfi1_tid_rdma_is_resync_psn(psn)) + ack_kpsn = resync_psn; + else + ack_kpsn = psn; + if (aeth >> 29) { + ack_psn--; + ack_kpsn--; + } + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->acked_tail]; + + /* Drop stale ACK/NAK */ + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) + goto ack_op_err; + + while (cmp_psn(ack_kpsn, + full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && + req->ack_seg < req->cur_seg) { + req->ack_seg++; + /* advance acked segment pointer */ + req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); + req->r_last_acked = flow->flow_state.resp_ib_psn; + if (req->ack_seg == req->total_segs) { + req->state = TID_REQUEST_COMPLETE; + wqe = do_rc_completion(qp, wqe, + to_iport(qp->ibqp.device, + qp->port_num)); + atomic_dec(&qpriv->n_tid_requests); + if (qp->s_acked == qp->s_tail) + break; + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + req = wqe_to_tid_req(wqe); + } + flow = &req->flows[req->acked_tail]; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + if (qpriv->s_flags & RVT_S_WAIT_ACK) + qpriv->s_flags &= ~RVT_S_WAIT_ACK; + if (!hfi1_tid_rdma_is_resync_psn(psn)) { + hfi1_schedule_send(qp); + } else { + u32 spsn, fpsn, last_acked, generation; + struct tid_rdma_request *rptr; + + /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ + qp->s_flags &= ~HFI1_S_WAIT_HALT; + /* + * Clear RVT_S_SEND_ONE flag in case that the TID RDMA + * ACK is received after the TID retry timer is fired + * again. In this case, do not send any more TID + * RESYNC request or wait for any more TID ACK packet. + */ + qpriv->s_flags &= ~RVT_S_SEND_ONE; + hfi1_schedule_send(qp); + + if ((qp->s_acked == qpriv->s_tid_tail && + req->ack_seg == req->total_segs) || + qp->s_acked == qp->s_tail) { + qpriv->s_state = TID_OP(WRITE_DATA_LAST); + goto done; + } + + if (req->ack_seg == req->comp_seg) { + qpriv->s_state = TID_OP(WRITE_DATA); + goto done; + } + + /* + * The PSN to start with is the next PSN after the + * RESYNC PSN. + */ + psn = mask_psn(psn + 1); + generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + spsn = 0; + + /* + * Update to the correct WQE when we get an ACK(RESYNC) + * in the middle of a request. + */ + if (delta_psn(ack_psn, wqe->lpsn)) + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->acked_tail]; + /* + * RESYNC re-numbers the PSN ranges of all remaining + * segments. Also, PSN's start from 0 in the middle of a + * segment and the first segment size is less than the + * default number of packets. flow->resync_npkts is used + * to track the number of packets from the start of the + * real segment to the point of 0 PSN after the RESYNC + * in order to later correctly rewind the SGE. 
+ */ + fpsn = full_flow_psn(flow, flow->flow_state.spsn); + req->r_ack_psn = psn; + flow->resync_npkts += + delta_psn(mask_psn(resync_psn + 1), fpsn); + /* + * Renumber all packet sequence number ranges + * based on the new generation. + */ + last_acked = qp->s_acked; + rptr = req; + while (1) { + /* start from last acked segment */ + for (fidx = rptr->acked_tail; + CIRC_CNT(rptr->setup_head, fidx, + MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + u32 lpsn; + u32 gen; + + flow = &rptr->flows[fidx]; + gen = flow->flow_state.generation; + if (WARN_ON(gen == generation && + flow->flow_state.spsn != + spsn)) + continue; + lpsn = flow->flow_state.lpsn; + lpsn = full_flow_psn(flow, lpsn); + flow->npkts = + delta_psn(lpsn, + mask_psn(resync_psn) + ); + flow->flow_state.generation = + generation; + flow->flow_state.spsn = spsn; + flow->flow_state.lpsn = + flow->flow_state.spsn + + flow->npkts - 1; + flow->pkt = 0; + spsn += flow->npkts; + resync_psn += flow->npkts; + } + if (++last_acked == qpriv->s_tid_cur + 1) + break; + if (last_acked == qp->s_size) + last_acked = 0; + wqe = rvt_get_swqe_ptr(qp, last_acked); + rptr = wqe_to_tid_req(wqe); + } + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + } +done: + qpriv->s_retry = qp->s_retry_cnt; + break; + + case 3: /* NAK */ + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + flow = &req->flows[req->acked_tail]; + fspsn = full_flow_psn(flow, flow->flow_state.spsn); + req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + qpriv->s_retry = qp->s_retry_cnt; + break; + + default: + break; + } + break; + + default: + break; + } + +ack_op_err: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 89f5af627128..499036e7a3e8 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -101,6 +101,7 @@ struct tid_rdma_request { u32 seg_len; u32 total_len; + u32 r_ack_psn; /* next expected ack PSN */ u32 r_flow_psn; /* IB PSN of next segment start */ u32 r_last_acked; /* IB PSN of last ACK'ed packet */ u32 s_next_psn; /* IB PSN of next segment start for read */ @@ -285,4 +286,6 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, struct ib_other_headers *ohdr, u16 iflow, u32 *bth1, u32 *bth2); +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 51f5b0e8da71..a45b5257d6c4 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -52,7 +52,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \ "tid_r_comp %u pending_tid_r_segs %u " \ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ - "hw_flow_index %u generation 0x%x " \ + "s_state 0x%x hw_flow_index %u generation 0x%x " \ "fpsn 0x%x flow_flags 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ @@ -844,6 +844,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __field(u32, s_flags) __field(u32, ps_flags) __field(unsigned long, iow_flags) + __field(u8, s_state) __field(u32, hw_flow_index) __field(u32, generation) __field(u32, fpsn) @@ -861,6 +862,7 @@ DECLARE_EVENT_CLASS(/* 
tid_read_sender */ __entry->s_flags = qp->s_flags; __entry->ps_flags = priv->s_flags; __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; __entry->hw_flow_index = priv->flow_state.index; __entry->generation = priv->flow_state.generation; __entry->fpsn = priv->flow_state.psn; @@ -877,6 +879,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __entry->s_flags, __entry->ps_flags, __entry->iow_flags, + __entry->s_state, __entry->hw_flow_index, __entry->generation, __entry->fpsn, diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3a97a39aeba4..30e3f5af5cf1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -170,9 +170,12 @@ struct hfi1_qp_priv { struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 s_state; + u8 s_retry; u8 rnr_nak_state; /* RNR NAK state */ u8 s_nak_state; u32 s_nak_psn; @@ -197,6 +200,7 @@ struct hfi1_qp_priv { u32 r_next_psn_kdeth; u32 r_next_psn_kdeth_save; + u32 s_resync_psn; u8 sync_pt; /* Set when QP reaches sync point */ u8 resync; }; -- cgit v1.2.3-59-g8ed1b From 829eaee5d09a7500bdce9ed0bc6ec6861f8ae45b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:24 -0800 Subject: IB/hfi1: Add TID RDMA retry timer This patch adds the TID RDMA retry timer to make sure that TID RDMA WRITE DATA packets for a segment are received successfully by the responder. This timer is generally armed when the last TID RDMA WRITE DATA packet for a segment is sent out and stopped when all TID RDMA DATA packets are acknowledged. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/opfn.c | 2 + drivers/infiniband/hw/hfi1/tid_rdma.c | 93 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++ drivers/infiniband/hw/hfi1/verbs.h | 2 + 4 files changed, 101 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 82e1889ca969..370a5a8eaa71 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -252,6 +252,8 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { struct tid_rdma_params *local = &priv->tid_rdma.local; + if (attr_mask & IB_QP_TIMEOUT) + priv->tid_retry_timeout_jiffies = qp->timeout_jiffies; if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { tid_rdma_opfn_init(qp, local); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 5eb8453a719e..a4faf7d6c224 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -121,6 +121,9 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); static void hfi1_tid_timeout(struct timer_list *t); static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); +static void hfi1_tid_retry_timeout(struct timer_list *t); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -330,6 +333,7 @@ 
int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); + timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -4396,11 +4400,19 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) if (qpriv->s_flags & RVT_S_WAIT_ACK) qpriv->s_flags &= ~RVT_S_WAIT_ACK; if (!hfi1_tid_rdma_is_resync_psn(psn)) { + /* Check if there is any pending TID ACK */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_mod_tid_retry_timer(qp); + else + hfi1_stop_tid_retry_timer(qp); hfi1_schedule_send(qp); } else { u32 spsn, fpsn, last_acked, generation; struct tid_rdma_request *rptr; + /* ACK(RESYNC) */ + hfi1_stop_tid_retry_timer(qp); /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ qp->s_flags &= ~HFI1_S_WAIT_HALT; /* @@ -4506,6 +4518,7 @@ done: break; case 3: /* NAK */ + hfi1_stop_tid_retry_timer(qp); switch ((aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ @@ -4530,3 +4543,83 @@ done: ack_op_err: spin_unlock_irqrestore(&qp->s_lock, flags); } + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + priv->s_tid_retry_timer.expires = jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; + add_timer(&priv->s_tid_retry_timer); + } +} + +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + mod_timer(&priv->s_tid_retry_timer, jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); +} + +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + rval = del_timer(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; + } + return rval; +} + +void hfi1_del_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + del_timer_sync(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; +} + +static void hfi1_tid_retry_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); + struct rvt_qp *qp = priv->owner; + struct rvt_swqe *wqe; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + hfi1_stop_tid_retry_timer(qp); + if (!priv->s_retry) { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } else { + priv->s_flags &= ~RVT_S_WAIT_ACK; + /* Only send one packet (the RESYNC) */ + priv->s_flags |= RVT_S_SEND_ONE; + /* + * No additional request shall be made by this QP until + * the RESYNC has been complete. 
+ */ + qp->s_flags |= HFI1_S_WAIT_HALT; + priv->s_state = TID_OP(RESYNC); + priv->s_retry--; + } + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 499036e7a3e8..3be5f79ed1fb 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -29,6 +29,7 @@ #define HFI1_R_TID_RSC_TIMER BIT(2) /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_S_TID_RETRY_TIMER BIT(17) #define HFI1_R_TID_SW_PSN BIT(19) /* @@ -288,4 +289,7 @@ u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); +void hfi1_add_tid_retry_timer(struct rvt_qp *qp); +void hfi1_del_tid_retry_timer(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 30e3f5af5cf1..bfd642e831f7 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -164,6 +164,7 @@ struct hfi1_qp_priv { u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; struct timer_list s_tid_timer; /* for timing tid wait */ + struct timer_list s_tid_retry_timer; /* for timing tid ack */ struct list_head tid_wait; /* for queueing tid space */ struct hfi1_opfn_data opfn; struct tid_flow_state flow_state; @@ -172,6 +173,7 @@ struct hfi1_qp_priv { u8 hdr_type; /* 9B or 16B */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; + unsigned long tid_retry_timeout_jiffies; /* variables for the TID RDMA SE state machine */ u8 s_state; -- cgit v1.2.3-59-g8ed1b From 6e391c6a4a8f97d34fa859c906387c05e91adbe9 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:36 -0800 Subject: IB/hfi1: Add a function to build TID RDMA RESYNC packet This patch adds a function to build TID RDMA RESYNC packet, which is sent by the requester to notify the responder that no TID RDMA ACK packet has been received for a given KDETH PSN. 
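As a rough illustration of the PSN arithmetic (a standalone sketch, not driver code: the 24-bit PSN mask, the 11-bit sequence field and the helper names below are assumptions made for the example), the RESYNC PSN is taken to be the last PSN before the next generation starts, which is presumably also how the requester's ACK path recognizes a RESYNC PSN by its saturated sequence field:

#include <stdint.h>
#include <stdio.h>

#define PSN_MASK            0xFFFFFFu                      /* assumed 24-bit PSN space */
#define KDETH_BTH_SEQ_SHIFT 11                             /* assumed 11-bit sequence field */
#define KDETH_BTH_SEQ_MASK  ((1u << KDETH_BTH_SEQ_SHIFT) - 1)

static uint32_t resync_psn(uint32_t next_generation)
{
    /* last PSN of the current generation: next generation shifted up, minus 1 */
    return ((next_generation << KDETH_BTH_SEQ_SHIFT) - 1) & PSN_MASK;
}

static int is_resync_psn(uint32_t psn)
{
    /* a RESYNC PSN is recognized by its saturated sequence bits */
    return (psn & KDETH_BTH_SEQ_MASK) == KDETH_BTH_SEQ_MASK;
}

int main(void)
{
    uint32_t psn = resync_psn(5);

    printf("resync psn 0x%06x, is_resync %d\n", psn, is_resync_psn(psn));
    return 0;
}

Encoding the next generation directly in the RESYNC PSN lets the responder adopt it without any additional negotiation.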
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 26 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 ++++ 2 files changed, 30 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a4faf7d6c224..e10f6d714dff 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4623,3 +4623,29 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) spin_unlock(&qp->s_lock); spin_unlock_irqrestore(&qp->r_lock, flags); } + +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_params *remote; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[fidx]; + u32 generation; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + generation = kern_flow_generation_next(flow->flow_state.generation); + *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + qpriv->s_resync_psn = *bth2; + *bth2 |= IB_BTH_REQ_ACK; + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + + return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3be5f79ed1fb..d876b0efeac2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -292,4 +292,8 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); void hfi1_add_tid_retry_timer(struct rvt_qp *qp); void hfi1_del_tid_retry_timer(struct rvt_qp *qp); +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx); + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 7cf0ad679de46c61739238c3f4542f14cc7bbc69 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:46 -0800 Subject: IB/hfi1: Add a function to receive TID RDMA RESYNC packet This patch adds a function to receive TID RDMA RESYNC packet on the responder side. The QP's hardware flow will be updated and all allocated software flows will be updated accordingly in order to drop all stale packets. 
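The renumbering itself can be pictured with the minimal sketch below: every remaining flow keeps its packet count, but its start and end PSNs are reassigned from a sequence counter that restarts at zero under the new generation. The structure and field names here are invented for the illustration; the driver walks its ack queue and per-request flow rings rather than a flat array.

#include <stdint.h>
#include <stdio.h>

struct demo_flow {
    uint32_t generation;
    uint32_t spsn;   /* first sequence number of the flow */
    uint32_t lpsn;   /* last sequence number of the flow */
    uint32_t npkts;  /* packet count is the only thing preserved */
};

static void renumber_flows(struct demo_flow *flows, int nflows,
                           uint32_t new_generation)
{
    uint32_t spsn = 0;  /* sequence restarts at 0 in the new generation */
    int i;

    for (i = 0; i < nflows; i++) {
        flows[i].generation = new_generation;
        flows[i].spsn = spsn;
        flows[i].lpsn = spsn + flows[i].npkts - 1;
        spsn += flows[i].npkts;
    }
}

int main(void)
{
    struct demo_flow flows[2] = {
        { .generation = 3, .spsn = 100, .lpsn = 131, .npkts = 32 },
        { .generation = 3, .spsn = 132, .lpsn = 163, .npkts = 32 },
    };
    int i;

    renumber_flows(flows, 2, 4);
    for (i = 0; i < 2; i++)
        printf("flow %d: gen %u psn [%u, %u]\n", i,
               flows[i].generation, flows[i].spsn, flows[i].lpsn);
    return 0;
}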
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 103 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + 2 files changed, 105 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index e10f6d714dff..1901d5b6bbb9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4649,3 +4649,106 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); } + +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct tid_flow_state *fs = &qpriv->flow_state; + u32 psn, generation, idx, gen_next; + bool is_fecn; + unsigned long flags; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; + spin_lock_irqsave(&qp->s_lock, flags); + + gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? + generation : kern_flow_generation_next(fs->generation); + /* + * RESYNC packet contains the "next" generation and can only be + * from the current or previous generations + */ + if (generation != mask_generation(gen_next - 1) && + generation != gen_next) + goto bail; + /* Already processing a resync */ + if (qpriv->resync) + goto bail; + + spin_lock(&rcd->exp_lock); + if (fs->index >= RXE_NUM_TID_FLOWS) { + /* + * If we don't have a flow, save the generation so it can be + * applied when a new flow is allocated + */ + fs->generation = generation; + } else { + /* Reprogram the QP flow with new generation */ + rcd->flows[fs->index].generation = generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + } + fs->psn = 0; + /* + * Disable SW PSN checking since a RESYNC is equivalent to a + * sync point and the flow has/will be reprogrammed + */ + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + + /* + * Reset all TID flow information with the new generation. 
+ * This is done for all requests and segments after the + * last received segment + */ + for (idx = qpriv->r_tid_tail; ; idx++) { + u16 flow_idx; + + if (idx > rvt_size_atomic(&dev->rdi)) + idx = 0; + e = &qp->s_ack_queue[idx]; + if (e->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(e); + + /* start from last unacked segment */ + for (flow_idx = req->clear_tail; + CIRC_CNT(req->setup_head, flow_idx, + MAX_FLOWS); + flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { + u32 lpsn; + u32 next; + + flow = &req->flows[flow_idx]; + lpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + next = flow->flow_state.r_next_psn; + flow->npkts = delta_psn(lpsn, next - 1); + flow->flow_state.generation = fs->generation; + flow->flow_state.spsn = fs->psn; + flow->flow_state.lpsn = + flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, + flow->flow_state.spsn); + fs->psn += flow->npkts; + } + } + if (idx == qp->s_tail_ack_queue) + break; + } + + spin_unlock(&rcd->exp_lock); + qpriv->resync = true; + /* RESYNC request always gets a TID RDMA ACK. */ + qpriv->s_nak_state = 0; + qpriv->s_flags |= RVT_S_ACK_PENDING; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index d876b0efeac2..bdcf18455d9d 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -296,4 +296,6 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u16 fidx); +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 6e38fca6b1524e9a9aa0d2a10d99975eef1791c1 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:50:56 -0800 Subject: IB/hfi1: Resend the TID RDMA WRITE DATA packets This patch adds the logic to resend TID RDMA WRITE DATA packets. The tracking indices will be reset properly so that the correct TID entries will be used. 
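The index reset can be sketched as clearing every slot between the last acked position and setup_head in a circular flow array, as below. The circ_cnt()/circ_next() helpers, the MAX_FLOWS value and the demo structure are stand-ins written for this example (assuming a power-of-two ring size), not the driver's CIRC_* macros or flow layout.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MAX_FLOWS 32  /* illustrative power-of-two ring size */

struct demo_flow {
    uint32_t sent;
    uint32_t pkt;
    uint32_t tid_idx;
    uint32_t tid_offset;
    uint32_t resync_npkts;
};

static unsigned int circ_cnt(unsigned int head, unsigned int tail)
{
    return (head - tail) & (MAX_FLOWS - 1);
}

static unsigned int circ_next(unsigned int idx)
{
    return (idx + 1) & (MAX_FLOWS - 1);
}

/* Reset every slot in [acked_tail, setup_head) so it can be resent. */
static void reset_unacked_flows(struct demo_flow *flows,
                                unsigned int acked_tail,
                                unsigned int setup_head)
{
    unsigned int fidx;

    for (fidx = acked_tail; circ_cnt(setup_head, fidx);
         fidx = circ_next(fidx))
        memset(&flows[fidx], 0, sizeof(flows[fidx]));
}

int main(void)
{
    struct demo_flow flows[MAX_FLOWS] = { { .sent = 7, .pkt = 7 } };

    reset_unacked_flows(flows, 0, 3);
    printf("flow 0 sent=%u pkt=%u\n", flows[0].sent, flows[0].pkt);
    return 0;
}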
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 63 +++++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/verbs.h | 1 + 2 files changed, 58 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 1901d5b6bbb9..cb6321b0d2c9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3059,8 +3059,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, { struct tid_rdma_request *req = wqe_to_tid_req(wqe); struct tid_rdma_flow *flow; - int diff; - u32 tididx = 0; + struct hfi1_qp_priv *qpriv = qp->priv; + int diff, delta_pkts; + u32 tididx = 0, i; u16 fidx; if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { @@ -3076,11 +3077,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, return; } } else { - return; + fidx = req->acked_tail; + flow = &req->flows[fidx]; + *bth2 = mask_psn(req->r_ack_psn); } + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); + else + delta_pkts = delta_psn(*bth2, + full_flow_psn(flow, + flow->flow_state.spsn)); + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); - diff = delta_psn(*bth2, flow->flow_state.ib_spsn); + diff = delta_pkts + flow->resync_npkts; flow->sent = 0; flow->pkt = 0; @@ -3104,6 +3114,18 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, break; } } + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + + flow->sent, 0); + /* + * Packet PSN is based on flow_state.spsn + flow->pkt. However, + * during a RESYNC, the generation is incremented and the + * sequence is reset to 0. Since we've adjusted the npkts in the + * flow and the SGE has been sufficiently advanced, we have to + * adjust flow->pkt in order to calculate the correct PSN. + */ + flow->pkt -= flow->resync_npkts; + } if (flow->tid_offset == EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { @@ -3111,13 +3133,42 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, flow->tid_offset = 0; } flow->tid_idx = tididx; - /* Move flow_idx to correct index */ - req->flow_idx = fidx; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + /* Move flow_idx to correct index */ + req->flow_idx = fidx; + else + req->clear_tail = fidx; trace_hfi1_tid_flow_restart_req(qp, fidx, flow); trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); req->state = TID_REQUEST_ACTIVE; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + /* Reset all the flows that we are going to resend */ + fidx = CIRC_NEXT(fidx, MAX_FLOWS); + i = qpriv->s_tid_tail; + do { + for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + req->flows[fidx].sent = 0; + req->flows[fidx].pkt = 0; + req->flows[fidx].tid_idx = 0; + req->flows[fidx].tid_offset = 0; + req->flows[fidx].resync_npkts = 0; + } + if (i == qpriv->s_tid_cur) + break; + do { + i = (++i == qp->s_size ? 
0 : i); + wqe = rvt_get_swqe_ptr(qp, i); + } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); + req = wqe_to_tid_req(wqe); + req->cur_seg = req->ack_seg; + fidx = req->acked_tail; + /* Pull req->clear_tail back */ + req->clear_tail = fidx; + } while (1); + } } void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bfd642e831f7..ce40ea9f43c3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -171,6 +171,7 @@ struct hfi1_qp_priv { struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; unsigned long tid_retry_timeout_jiffies; -- cgit v1.2.3-59-g8ed1b From 70dcb2e3dc6aa827d74e09c830ea06c660274880 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:07 -0800 Subject: IB/hfi1: Add the TID second leg send packet builder To improve performance, the TID RDMA WRITE protocol is designed to own a second leg to send data and ack packets in the KDETH PSN space. This patch adds the packet builder for the requester side, which contains the state machine to build TID RDMA WRITE DATA and TID RDMA RESYNC packet. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 211 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 7 ++ drivers/infiniband/hw/hfi1/verbs.h | 2 + 3 files changed, 220 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index cb6321b0d2c9..44c5c0010888 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -331,6 +331,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_requests, 0); atomic_set(&qpriv->n_tid_requests, 0); timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); @@ -4803,3 +4804,213 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) bail: spin_unlock_irqrestore(&qp->s_lock, flags); } + +/* + * Call this function when the last TID RDMA WRITE DATA packet for a request + * is built. 
+ */ +static void update_tid_tail(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 i; + struct rvt_swqe *wqe; + + lockdep_assert_held(&qp->s_lock); + /* Can't move beyond s_tid_cur */ + if (priv->s_tid_tail == priv->s_tid_cur) + return; + for (i = priv->s_tid_tail + 1; ; i++) { + if (i == qp->s_size) + i = 0; + + if (i == priv->s_tid_cur) + break; + wqe = rvt_get_swqe_ptr(qp, i); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + priv->s_tid_tail = i; + priv->s_state = TID_OP(WRITE_RESP); +} + +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe; + u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss = &qp->s_sge; + struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + struct tid_rdma_request *req = ack_to_tid_req(e); + bool last = false; + u8 opcode = TID_OP(WRITE_DATA); + + lockdep_assert_held(&qp->s_lock); + /* + * Prioritize the sending of the requests and responses over the + * sending of the TID RDMA data packets. + */ + if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && + atomic_read(&priv->n_requests) && + !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | + HFI1_S_ANY_WAIT_IO))) || + (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && + !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { + struct iowait_work *iowork; + + iowork = iowait_get_ib_work(&priv->s_iowait); + ps->s_txreq = get_waiting_verbs_txreq(iowork); + if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { + priv->s_flags |= HFI1_S_TID_BUSY_SET; + return 1; + } + } + + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done_free_tx; + } + + if (priv->s_flags & RVT_S_WAIT_ACK) + goto bail; + + /* Check whether there is anything to do. */ + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) + goto bail; + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + switch (priv->s_state) { + case TID_OP(WRITE_REQ): + case TID_OP(WRITE_RESP): + priv->tid_ss.sge = wqe->sg_list[0]; + priv->tid_ss.sg_list = wqe->sg_list + 1; + priv->tid_ss.num_sge = wqe->wr.num_sge; + priv->tid_ss.total_len = wqe->length; + + if (priv->s_state == TID_OP(WRITE_REQ)) + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + priv->s_state = TID_OP(WRITE_DATA); + /* fall through */ + + case TID_OP(WRITE_DATA): + /* + * 1. Check whether TID RDMA WRITE RESP available. + * 2. If no: + * 2.1 If have more segments and no TID RDMA WRITE RESP, + * set HFI1_S_WAIT_TID_RESP + * 2.2 Return indicating no progress made. + * 3. If yes: + * 3.1 Build TID RDMA WRITE DATA packet. + * 3.2 If last packet in segment: + * 3.2.1 Change KDETH header bits + * 3.2.2 Advance RESP pointers. + * 3.3 Return indicating progress made. 
+ */ + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + len = wqe->length; + + if (!req->comp_seg || req->cur_seg == req->comp_seg) + goto bail; + + last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, + &len); + + if (last) { + /* move pointer to next flow */ + req->clear_tail = CIRC_NEXT(req->clear_tail, + MAX_FLOWS); + if (++req->cur_seg < req->total_segs) { + if (!CIRC_CNT(req->setup_head, req->clear_tail, + MAX_FLOWS)) + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + } else { + priv->s_state = TID_OP(WRITE_DATA_LAST); + opcode = TID_OP(WRITE_DATA_LAST); + + /* Advance the s_tid_tail now */ + update_tid_tail(qp); + } + } + hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); + ss = &priv->tid_ss; + break; + + case TID_OP(RESYNC): + /* Use generation from the most recently received response */ + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + req = wqe_to_tid_req(wqe); + /* If no responses for this WQE look at the previous one */ + if (!req->comp_seg) { + wqe = rvt_get_swqe_ptr(qp, + (!priv->s_tid_cur ? qp->s_size : + priv->s_tid_cur) - 1); + req = wqe_to_tid_req(wqe); + } + hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, + &bth2, + CIRC_PREV(req->setup_head, + MAX_FLOWS)); + ss = NULL; + len = 0; + opcode = TID_OP(RESYNC); + break; + + default: + goto bail; + } + if (priv->s_flags & RVT_S_SEND_ONE) { + priv->s_flags &= ~RVT_S_SEND_ONE; + priv->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = ss; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, + middle, ps); + return 1; +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); +bail_no_tx: + ps->s_txreq = NULL; + priv->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again, set the flags to the the wake up which work item to wake + * up. + * (A better algorithm should be found to do this and generalize the + * sleep/wakeup flags.) + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + return 0; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index bdcf18455d9d..0ce0ef6d60f2 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,9 +26,13 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +#define HFI1_S_TID_BUSY_SET BIT(0) +/* BIT(1) reserved for RVT_S_BUSY. */ #define HFI1_R_TID_RSC_TIMER BIT(2) +/* BIT(3) reserved for RVT_S_RESP_PENDING. */ /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. 
*/ #define HFI1_S_TID_RETRY_TIMER BIT(17) #define HFI1_R_TID_SW_PSN BIT(19) @@ -298,4 +302,7 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); +struct hfi1_pkt_state; +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index ce40ea9f43c3..3e45149d22c4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,6 +172,8 @@ struct hfi1_qp_priv { struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ + atomic_t n_requests; /* # of TID RDMA requests in the */ + /* queue */ atomic_t n_tid_requests; /* # of sent TID RDMA requests */ unsigned long tid_timer_timeout_jiffies; unsigned long tid_retry_timeout_jiffies; -- cgit v1.2.3-59-g8ed1b From 24c5bfeaf1e66efbc15cd9a6f5565c38d8cdb630 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:17 -0800 Subject: IB/hfi1: Add the TID second leg ACK packet builder This patch adds the TID packet builder for the responder side, which contains the state machine to build TID RDMA ACK packet for either TID RDMA WRITE DATA or TID RDMA RESYNC packets. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 141 ++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 44c5c0010888..15c12243c166 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -124,6 +124,9 @@ static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); static void hfi1_tid_retry_timeout(struct timer_list *t); +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -4874,6 +4877,10 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + if ((priv->s_flags & RVT_S_ACK_PENDING) && + make_tid_rdma_ack(qp, ohdr, ps)) + return 1; + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; @@ -5014,3 +5021,137 @@ bail_no_tx: iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); return 0; } + +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + u32 hwords, next; + u32 len = 0; + u32 bth1 = 0, bth2 = 0; + int middle = 0; + u16 flow; + struct tid_rdma_request *req, *nreq; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + /* + * In the RESYNC case, we are exactly one segment past the + * previously sent ack or at the previously sent NAK. 
So to send + * the resync ack, we go back one segment (which might be part of + * the previous request) and let the do-while loop execute again. + * The advantage of executing the do-while loop is that any data + * received after the previous ack is automatically acked in the + * RESYNC ack. It turns out that for the do-while loop we only need + * to pull back qpriv->r_tid_ack, not the segment + * indices/counters. The scheme works even if the previous request + * was not a TID WRITE request. + */ + if (qpriv->resync) { + if (!req->ack_seg || req->ack_seg == req->total_segs) + qpriv->r_tid_ack = !qpriv->r_tid_ack ? + rvt_size_atomic(&dev->rdi) : + qpriv->r_tid_ack - 1; + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } + + /* + * If we've sent all the ACKs that we can, we are done + * until we get more segments... + */ + if (!qpriv->s_nak_state && !qpriv->resync && + req->ack_seg == req->comp_seg) + goto bail; + + do { + /* + * To deal with coalesced ACKs, the acked_tail pointer + * into the flow array is used. The distance between it + * and the clear_tail is the number of flows that are + * being ACK'ed. + */ + req->ack_seg += + /* Get up-to-date value */ + CIRC_CNT(req->clear_tail, req->acked_tail, + MAX_FLOWS); + /* Advance acked index */ + req->acked_tail = req->clear_tail; + + /* + * req->clear_tail points to the segment currently being + * received. So, when sending an ACK, the previous + * segment is being ACK'ed. + */ + flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); + if (req->ack_seg != req->total_segs) + break; + req->state = TID_REQUEST_COMPLETE; + + next = qpriv->r_tid_ack + 1; + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + qpriv->r_tid_ack = next; + if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) + break; + nreq = ack_to_tid_req(&qp->s_ack_queue[next]); + if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) + break; + + /* Move to the next ack entry now */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } while (1); + + /* + * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and + * req could be pointing at the previous ack queue entry + */ + if (qpriv->s_nak_state || + (qpriv->resync && + !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && + (cmp_psn(qpriv->r_next_psn_kdeth - 1, + full_flow_psn(&req->flows[flow], + req->flows[flow].flow_state.lpsn)) > 0))) { + /* + * A NAK will implicitly acknowledge all previous TID RDMA + * requests. 
Therefore, we NAK with the req->acked_tail + * segment for the request at qpriv->r_tid_ack (same at + * this point as the req->clear_tail segment for the + * qpriv->r_tid_tail request) + */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + flow = req->acked_tail; + } + + hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, + &bth2); + len = 0; + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = qpriv->s_sde; + ps->s_txreq->s_cur_size = len; + ps->s_txreq->ss = NULL; + hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, + ps); + return 1; +bail: + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + return 0; +} -- cgit v1.2.3-59-g8ed1b From 572f0c3301138961a596c522729afb5801135d6e Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:27 -0800 Subject: IB/hfi1: Add the dual leg code The "Second Leg" of the TID RDMA WRITE protocol deals with the transfer of data and ack packets, which are in the KDETH PSN space, as opposed to the IB PSN space. Therefore, the Second Leg could be considered as a separate state machine. As such, it is handled by a different work queue item which is scheduled along with the normal IB state machine work item. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/iowait.h | 12 +++ drivers/infiniband/hw/hfi1/qp.c | 34 +++++++- drivers/infiniband/hw/hfi1/qp.h | 1 + drivers/infiniband/hw/hfi1/ruc.c | 32 ++++++-- drivers/infiniband/hw/hfi1/tid_rdma.c | 141 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 4 + drivers/infiniband/hw/hfi1/verbs.h | 3 + 7 files changed, 217 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 23a58ac0d47c..bd913701761d 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -185,6 +185,18 @@ static inline bool iowait_schedule(struct iowait *wait, return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } +/** + * iowait_tid_schedule - schedule the tid SE + * @wait: the iowait structure + * @wq: the work queue + * @cpu: the cpu + */ +static inline bool iowait_tid_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork); +} + /** * iowait_sdma_drain() - wait for DMAs to drain * diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 31b4b60f4364..96632c77f36f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -431,6 +431,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp) if (ret) iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); } + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) { + ret = hfi1_schedule_tid_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -450,8 +455,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) { - if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) { qp->s_flags &= ~RVT_S_BUSY; + /* + * If we are 
sending a first-leg packet from the second leg, + * we need to clear the busy flag from priv->s_flags to + * avoid a race condition when the qp wakes up before + * the call to hfi1_verbs_send() returns to the second + * leg. In that case, the second leg will terminate without + * being re-scheduled, resulting in failure to send TID RDMA + * WRITE DATA and TID RDMA ACK packets. + */ + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + priv->s_flags &= ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } + } else { + priv->s_flags &= ~RVT_S_BUSY; + } } static int iowait_sleep( @@ -694,7 +718,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, - NULL, + _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -851,7 +875,8 @@ void notify_error_qp(struct rvt_qp *qp) if (lock) { write_seqlock(lock); if (!list_empty(&priv->s_iowait.list) && - !(qp->s_flags & RVT_S_BUSY)) { + !(qp->s_flags & RVT_S_BUSY) && + !(priv->s_flags & RVT_S_BUSY)) { qp->s_flags &= ~RVT_S_ANY_WAIT_IO; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; @@ -860,7 +885,8 @@ void notify_error_qp(struct rvt_qp *qp) write_sequnlock(lock); } - if (!(qp->s_flags & RVT_S_BUSY)) { + if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; if (qp->s_rdma_mr) { rvt_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index d531b760ea93..b670321365d3 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -82,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA) /* * Send if not busy or waiting for I/O and either diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index f96c0f544cb0..124a3ec1e15c 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -453,11 +453,13 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, #define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ /** - * schedule_send_yield - test for a yield required for QP send engine + * hfi1_schedule_send_yield - test for a yield required for QP + * send engine * @timeout: Final time for timeout slice for jiffies * @qp: a pointer to QP * @ps: a pointer to a structure with commonly lookup values for * the the send engine progress + * @tid - true if it is the tid leg * * This routine checks if the time slice for the QP has expired * for RC QPs, if so an additional work entry is queued. At this @@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, * returns true if a yield is required, otherwise, false * is returned. 
*/ -static bool schedule_send_yield(struct rvt_qp *qp, - struct hfi1_pkt_state *ps) +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid) { ps->pkts_sent = true; @@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp, if (!ps->in_thread || workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { spin_lock_irqsave(&qp->s_lock, ps->flags); - qp->s_flags &= ~RVT_S_BUSY; - hfi1_schedule_send(qp); + if (!tid) { + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + } else { + struct hfi1_qp_priv *priv = qp->priv; + + if (priv->s_flags & + HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= + ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + } else { + priv->s_flags &= ~RVT_S_BUSY; + } + hfi1_schedule_tid_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, ps->flags); this_cpu_inc(*ps->ppd->dd->send_schedule); trace_hfi1_rc_expired_time_slice(qp, true); @@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) do { /* Check for a constructed packet to be sent. */ if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) + qp->s_flags |= RVT_S_BUSY; spin_unlock_irqrestore(&qp->s_lock, ps.flags); /* * If the packet cannot be sent now, return and @@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) return; /* allow other tasks to run */ - if (schedule_send_yield(qp, &ps)) + if (hfi1_schedule_send_yield(qp, &ps, false)) return; spin_lock_irqsave(&qp->s_lock, ps.flags); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 15c12243c166..80111dd1d876 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -127,6 +127,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t); static int make_tid_rdma_ack(struct rvt_qp *qp, struct ib_other_headers *ohdr, struct hfi1_pkt_state *ps); +static void hfi1_do_tid_send(struct rvt_qp *qp); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -3048,6 +3049,7 @@ nak_psn: qpriv->s_flags |= RVT_S_ACK_PENDING; if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) qpriv->r_tid_ack = qpriv->r_tid_tail; + hfi1_schedule_tid_send(qp); } goto unlock; } @@ -3517,6 +3519,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) ret = -EAGAIN; to_seg = MAX_FLOWS >> 1; qpriv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); break; } @@ -4128,6 +4131,7 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) } qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; + hfi1_schedule_tid_send(qp); goto ack_done; ack_op_err: @@ -4287,6 +4291,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) done: priv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); exit: priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; spin_unlock_irqrestore(&qp->s_lock, flags); @@ -4299,6 +4304,7 @@ send_nak: priv->s_flags |= RVT_S_ACK_PENDING; if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) priv->r_tid_ack = priv->r_tid_tail; + hfi1_schedule_tid_send(qp); } goto done; } @@ -4567,6 +4573,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) req->cur_seg = req->ack_seg; qpriv->s_tid_tail = qp->s_acked; qpriv->s_state = TID_OP(WRITE_REQ); + hfi1_schedule_tid_send(qp); } done: qpriv->s_retry = qp->s_retry_cnt; @@ -4584,6 +4591,7 @@ done: qpriv->s_tid_tail = qp->s_acked; qpriv->s_state = TID_OP(WRITE_REQ); qpriv->s_retry = qp->s_retry_cnt; + hfi1_schedule_tid_send(qp); break; default: @@ -4673,6 +4681,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) 
qp->s_flags |= HFI1_S_WAIT_HALT; priv->s_state = TID_OP(RESYNC); priv->s_retry--; + hfi1_schedule_tid_send(qp); } } spin_unlock(&qp->s_lock); @@ -4804,6 +4813,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) /* RESYNC request always gets a TID RDMA ACK. */ qpriv->s_nak_state = 0; qpriv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); bail: spin_unlock_irqrestore(&qp->s_lock, flags); } @@ -5155,3 +5165,134 @@ bail: qpriv->s_flags &= ~RVT_S_ACK_PENDING; return 0; } + +static int hfi1_send_tid_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(priv->s_flags & RVT_S_BUSY || + qp->s_flags & HFI1_S_ANY_WAIT_IO) && + (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || + (priv->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); +} + +void _hfi1_do_tid_send(struct work_struct *work) +{ + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); + + hfi1_do_tid_send(qp); +} + +static void hfi1_do_tid_send(struct rvt_qp *qp) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + ps.wait = iowait_get_tid_work(&priv->s_iowait); + ps.in_thread = false; + ps.timeout_int = qp->timeout_jiffies / 8; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_tid_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + priv->s_flags |= RVT_S_BUSY; + + ps.timeout = jiffies + ps.timeout_int; + ps.cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; + + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); + do { + /* Check for a constructed packet to be sent. */ + if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags |= RVT_S_BUSY; + ps.wait = iowait_get_ib_work(&priv->s_iowait); + } + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + + /* allow other tasks to run */ + if (hfi1_schedule_send_yield(qp, &ps, true)) + return; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= ~HFI1_S_TID_BUSY_SET; + ps.wait = iowait_get_tid_work(&priv->s_iowait); + if (iowait_flag_set(&priv->s_iowait, + IOWAIT_PENDING_IB)) + hfi1_schedule_send(qp); + } + } + } while (hfi1_make_tid_rdma_pkt(qp, &ps)); + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} + +static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +/** + * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine + * @qp: the QP + * + * This schedules qp progress on the TID RDMA state machine. 
Caller + * should hold the s_lock. + * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because + * the two state machines can step on each other with respect to the + * RVT_S_BUSY flag. + * Therefore, a modified test is used. + * @return true if the second leg is scheduled; + * false if the second leg is not scheduled. + */ +bool hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + if (hfi1_send_tid_ok(qp)) { + /* + * The following call returns true if the qp is not on the + * queue and false if the qp is already on the queue before + * this call. Either way, the qp will be on the queue when the + * call returns. + */ + _hfi1_schedule_tid_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_TID); + return false; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 0ce0ef6d60f2..7f8f17ba6c14 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -305,4 +305,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); struct hfi1_pkt_state; int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps); +void _hfi1_do_tid_send(struct work_struct *work); + +bool hfi1_schedule_tid_send(struct rvt_qp *qp); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 3e45149d22c4..bee3d21a548e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -443,6 +443,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid); + void _hfi1_do_send(struct work_struct *work); void hfi1_do_send_from_rvt(struct rvt_qp *qp); -- cgit v1.2.3-59-g8ed1b From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:39 -0800 Subject: IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs This patch integrates TID RDMA WRITE protocol into normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 + drivers/infiniband/hw/hfi1/rc.c | 487 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/tid_rdma.c | 14 + drivers/infiniband/hw/hfi1/user_sdma.c | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +- drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/rdmavt_qp.h | 1 + 7 files changed, 481 insertions(+), 49 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 96632c77f36f..cfd598e4b303 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .flags = RVT_OPERATION_USE_RESERVE, }, +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fcb733ea8dfb..6d2abea896e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; + u32 hwords, hdrlen; u32 len = 0; u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; bool last_pkt; u32 delta; u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; } else if (e->opcode == TID_OP(READ_REQ)) { /* * If a TID RDMA read response is being resent and @@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. 
Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + case TID_OP(READ_RESP): read_resp: e = &qp->s_ack_queue[qp->s_tail_ack_queue]; @@ -298,8 +367,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | @@ -311,9 +379,11 @@ normal: len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); @@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int middle = 0; int delta; struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); @@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -586,6 +657,108 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. 
+ */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started @@ -745,7 +918,8 @@ no_flow_control: if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_psn = req->s_next_psn; @@ -865,6 +1039,33 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). 
+ */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case TID_OP(READ_RESP): if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto bail; @@ -965,7 +1166,8 @@ no_flow_control: } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -998,6 +1200,12 @@ bail: bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) lockdep_assert_held(&qp->s_lock); qp->s_cur = n; priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; qp->s_num_rd_atomic = 0; /* @@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; @@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. 
*/ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { @@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_TID_RDMA_READ) + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) opcode = ib_bth_get_opcode(ohdr); if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) || - opcode == TID_OP(READ_RESP)) { + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { if (opcode == TID_OP(READ_REQ)) rvt_add_retry_timer_ext(qp, priv->timeout_shift); else rvt_add_retry_timer(qp); } + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. 
+ */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. 
@@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; @@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, else rvt_stop_rc_timers(qp); } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We are expecting more ACKs so - * mod the retry timer. - */ - rvt_mod_retry_timer(qp); - /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; update_last_psn(qp, psn); return 1; @@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. 
+ */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 80111dd1d876..490e47a0f68b 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) do { struct hfi1_swqe_priv *priv = wqe->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } + for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) + i = 0; + /* Free only locally allocated TID entries */ + if (e->opcode != TID_OP(WRITE_REQ)) + continue; + do { + struct hfi1_ack_priv *priv = e->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); } while (!ret); } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index e5e7fad09f32..6764114b886c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 0xffffffull), psn = val & mask; if (expct) - psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); else psn = psn + frags; return psn & mask; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 7b87b77582bd..ab97d71cdd92 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, @@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + 
[IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, @@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bee3d21a548e..62ace0b2d17a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -193,6 +193,7 @@ struct hfi1_qp_priv { u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ u32 r_tid_alloc; /* Request for which we are allocating resources */ u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ u32 alloc_w_segs; /* Number of segments for which write */ /* resources have been allocated for this QP */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 4ee612ab6cb4..f0fbd4063fef 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) -- cgit v1.2.3-59-g8ed1b From c6c231175ccdf188d443c27e5456b9e2f65e44d4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:49 -0800 Subject: IB/hfi1: Add interlock between TID RDMA WRITE and other requests This locking mechanism is designed to prevent various memory corruption scenarios from occurring when requests are pipelined, especially when RDMA WRITE requests are interleaved with TID RDMA READ requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; 4. WRITE-AFTER-WRITE. When memory corruption is likely, a request will be held back until previous requests have been completed.
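A minimal sketch of the requester-side check that the diff below introduces (hfi1_tid_rdma_wqe_interlock): a new request is held back when the previous send WQE is a TID RDMA WRITE that still has unacknowledged segments. The helper name below is hypothetical and not part of the patch; the types and fields (rvt_swqe, tid_rdma_request, ack_seg, total_segs) are the ones used in the diff.

/* Hypothetical helper, for illustration only; not part of the patch. */
static bool prev_tid_write_pending(struct rvt_qp *qp, u32 s_cur)
{
	u32 s_prev = (s_cur == 0 ? qp->s_size : s_cur) - 1;
	struct rvt_swqe *prev = rvt_get_swqe_ptr(qp, s_prev);
	struct tid_rdma_request *req;

	if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
		return false;
	req = wqe_to_tid_req(prev);
	/* Unacked TID RDMA WRITE segments remain: the new request must wait. */
	return req->ack_seg != req->total_segs;
}

When this check fires, the QP is parked (the HFI1_S_TID_WAIT_INTERLCK and HFI1_R_TID_WAIT_INTERLCK flags in tid_rdma.h track the requester and responder cases) and the send engine is re-driven once the outstanding segments have been acknowledged.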
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 6 +++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 46 +++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/hfi1/tid_rdma.h | 9 +++++++ 3 files changed, 59 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6d2abea896e5..cfb863364f50 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -173,6 +173,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, } e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } if (e->opcode == OP(RDMA_READ_REQUEST)) { /* * If a RDMA read response is being resent and diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 490e47a0f68b..286752011f25 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2179,6 +2179,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, req->state = TID_REQUEST_RESEND; req->cur_seg = req->comp_seg; } + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -3229,6 +3230,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) struct rvt_swqe *prev; struct hfi1_qp_priv *priv = qp->priv; u32 s_prev; + struct tid_rdma_request *req; s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; prev = rvt_get_swqe_ptr(qp, s_prev); @@ -3240,14 +3242,28 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_RDMA_WRITE: + switch (prev->wr.opcode) { + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + default: + break; + } case IB_WR_RDMA_READ: - break; + if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + /* fall through */ case IB_WR_TID_RDMA_READ: switch (prev->wr.opcode) { case IB_WR_RDMA_READ: if (qp->s_acked != qp->s_cur) goto interlock; break; + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; default: break; } @@ -5157,7 +5173,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, e = &qp->s_ack_queue[qpriv->r_tid_ack]; req = ack_to_tid_req(e); flow = req->acked_tail; - } + } else if (req->ack_seg == req->total_segs && + qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); @@ -5310,3 +5328,27 @@ bool hfi1_schedule_tid_send(struct rvt_qp *qp) IOWAIT_PENDING_TID); return false; } + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) +{ + struct rvt_ack_entry *prev; + struct tid_rdma_request *req; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = qp->s_tail_ack_queue == 0 ? 
rvt_size_atomic(&dev->rdi) : + (qp->s_tail_ack_queue - 1); + prev = &qp->s_ack_queue[s_prev]; + + if ((e->opcode == TID_OP(READ_REQ) || + e->opcode == OP(RDMA_READ_REQUEST)) && + prev->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(prev); + if (req->ack_seg != req->total_segs) { + priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; + return true; + } + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 7f8f17ba6c14..44468188a374 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -25,6 +25,7 @@ * s_flags, there are no collisions. * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock */ #define HFI1_S_TID_BUSY_SET BIT(0) /* BIT(1) reserved for RVT_S_BUSY. */ @@ -32,9 +33,15 @@ /* BIT(3) reserved for RVT_S_RESP_PENDING. */ /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_WAIT_INTERLCK BIT(6) /* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */ +/* BIT(16) reserved for RVT_S_SEND_ONE */ #define HFI1_S_TID_RETRY_TIMER BIT(17) +/* BIT(18) reserved for RVT_S_ECN. */ #define HFI1_R_TID_SW_PSN BIT(19) +/* BIT(26) reserved for HFI1_S_WAIT_HALT */ +/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */ +/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */ /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -309,4 +316,6 @@ void _hfi1_do_tid_send(struct work_struct *work); bool hfi1_schedule_tid_send(struct rvt_qp *qp); +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e); + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From ad00889e7ca226a2bed2b210f17c93b7be1b1542 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:59 -0800 Subject: IB/hfi1: Enable TID RDMA WRITE protocol This patch enables the TID RDMA WRITE protocol by converting a qualified RDMA WRITE request into a TID RDMA WRITE request internally: (1) The TID RDMA capability must be enabled; (2) The request must start on a 4K page boundary; (3) The request length must be a multiple of 4K and must be larger than or equal to 256K. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 22 ++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 286752011f25..db3188f66dba 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3322,6 +3322,18 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) new_opcode = IB_WR_TID_RDMA_READ; do_tid_rdma = true; } + } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + /* + * TID RDMA is enabled for this RDMA WRITE request iff: + * 1. The remote address is page-aligned, + * 2. The length is larger than the minimum segment size, + * 3. The length is page-multiple.
+ */ + if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && + !(wqe->length & ~PAGE_MASK)) { + new_opcode = IB_WR_TID_RDMA_WRITE; + do_tid_rdma = true; + } } if (do_tid_rdma) { @@ -3338,12 +3350,22 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) priv->tid_req.n_flows = remote->max_read; qpriv->tid_r_reqs++; wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; + } else { + wqe->lpsn += priv->tid_req.total_segs - 1; + atomic_inc(&qpriv->n_requests); } priv->tid_req.cur_seg = 0; priv->tid_req.comp_seg = 0; priv->tid_req.ack_seg = 0; priv->tid_req.state = TID_REQUEST_INACTIVE; + /* + * Reset acked_tail. + * TID RDMA READ does not have ACKs so it does not + * update the pointer. We have to reset it so TID RDMA + * WRITE does not get confused. + */ + priv->tid_req.acked_tail = priv->tid_req.setup_head; trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, wqe->psn, wqe->lpsn, &priv->tid_req); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 44468188a374..53ab24ef4f02 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -266,7 +266,8 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) { if (wqe->priv && - wqe->wr.opcode == IB_WR_RDMA_READ && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_RDMA_WRITE) && wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE) setup_tid_rdma_wqe(qp, wqe); } -- cgit v1.2.3-59-g8ed1b From a05c9bdcfd16cec3a004cca339ab45de4cdf4799 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:09 -0800 Subject: IB/hfi1: Add static trace for TID RDMA WRITE protocol This patch makes the following changes to the static trace: 1. Adds the decoding of TID RDMA WRITE packets to the IB header trace; 2. Adds trace events for various stages of the TID RDMA WRITE protocol. These events provide fine-grained control for monitoring and debugging the hfi1 driver in the field.
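As a usage note, these events follow the usual kernel tracepoint conventions and can be enabled per event group through tracefs. The sketch below is a minimal user-space example only; the tracefs mount point and the "hfi1_tid" group name are assumptions (the actual group names come from the TRACE_SYSTEM values in the trace_*.h headers), not something defined by this patch.

#include <stdio.h>

/*
 * Minimal sketch, assuming a debugfs-mounted tracefs and an "hfi1_tid"
 * event group; adjust both for the local system.
 */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Enable every event in the assumed hfi1_tid group, then turn tracing on. */
	write_str("/sys/kernel/debug/tracing/events/hfi1_tid/enable", "1");
	write_str("/sys/kernel/debug/tracing/tracing_on", "1");
	/* The decoded events can then be read from .../tracing/trace. */
	return 0;
}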
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 12 + drivers/infiniband/hw/hfi1/tid_rdma.c | 88 +++++ drivers/infiniband/hw/hfi1/trace.c | 66 ++++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 6 + drivers/infiniband/hw/hfi1/trace_tid.h | 517 +++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/trace_tx.h | 6 + 6 files changed, 692 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index cfb863364f50..82afa7736be7 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -162,6 +162,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, qp->s_acked_ack_queue == qp->s_tail_ack_queue) qp->s_acked_ack_queue = next; qp->s_tail_ack_queue = next; + trace_hfi1_rsp_make_rc_ack(qp, e->psn); /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -263,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(e->psn); e->sent = 1; } + trace_hfi1_tid_write_rsp_make_rc_ack(qp); bth0 = qp->s_ack_state << 24; break; @@ -335,6 +337,8 @@ write_resp: hwords += hdrlen; bth0 = qp->s_ack_state << 24; qp->s_ack_rdma_psn++; + trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, + e->lpsn, req); if (req->cur_seg != req->total_segs) break; @@ -761,6 +765,11 @@ no_flow_control: delta_psn(wqe->lpsn, bth2) + 1; } + trace_hfi1_tid_write_sender_make_req(qp, newreq); + trace_hfi1_tid_req_make_req_write(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); if (++qp->s_cur == qp->s_size) qp->s_cur = 0; break; @@ -1070,6 +1079,8 @@ no_flow_control: priv->s_tid_cur = qp->s_cur; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; + trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; case TID_OP(READ_RESP): @@ -1625,6 +1636,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) wqe = do_rc_completion(qp, wqe, ibp); qp->s_flags &= ~RVT_S_WAIT_ACK; } else { + trace_hfi1_tid_write_sender_restart_rc(qp, 0); if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { struct tid_rdma_request *req; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index db3188f66dba..a49eb3d9b5b9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2962,6 +2962,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, e = &qp->s_ack_queue[qpriv->r_tid_tail]; req = ack_to_tid_req(e); flow = &req->flows[req->clear_tail]; + trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); + trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); + trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); + trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, + e->lpsn, req); + trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); switch (rcv_type) { case RHF_RCV_TYPE_EXPECTED: @@ -3489,6 +3495,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) lockdep_assert_held(&qp->s_lock); while (1) { + trace_hfi1_rsp_tid_write_alloc_res(qp, 0); + trace_hfi1_tid_write_rsp_alloc_res(qp); /* * Don't allocate more segments if a RNR NAK has already been * scheduled to avoid messing up qp->r_psn: the RNR NAK will @@ -3517,6 +3525,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (e->opcode != TID_OP(WRITE_REQ)) goto next_req; req = ack_to_tid_req(e); + 
trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* Finished allocating for all segments of this request */ if (req->alloc_seg >= req->total_segs) goto next_req; @@ -3633,6 +3643,7 @@ send_rnr_nak: */ qp->s_flags &= ~(RVT_S_ACK_PENDING); + trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); /* * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be @@ -3686,6 +3697,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_write_req(qp, psn); if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) rvt_comm_est(qp); @@ -3794,6 +3806,9 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) qp->r_msn++; qp->r_psn++; + trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { qpriv->r_tid_tail = qp->r_head_ack_queue; } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { @@ -3814,6 +3829,7 @@ update_head: qpriv->r_tid_head = qp->r_head_ack_queue; hfi1_tid_write_alloc_resources(qp, true); + trace_hfi1_tid_write_rsp_rcv_req(qp); /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -3855,6 +3871,10 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, void *resp_addr = NULL; struct tid_rdma_params *remote; + trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_build_resp(qp); + trace_hfi1_rsp_build_tid_write_resp(qp, bth2); flow = &req->flows[req->flow_idx]; switch (req->state) { default: @@ -3876,12 +3896,14 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, goto done; req->state = TID_REQUEST_ACTIVE; + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); hfi1_add_tid_reap_timer(qp); break; case TID_REQUEST_RESEND_ACTIVE: case TID_REQUEST_RESEND: + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) req->state = TID_REQUEST_ACTIVE; @@ -3996,6 +4018,9 @@ static void hfi1_tid_timeout(struct timer_list *t) if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", qp->ibqp.qp_num, __func__, __LINE__); + trace_hfi1_msg_tid_timeout(/* msg */ + qp, "resource timeout = ", + (u64)qpriv->tid_timer_timeout_jiffies); hfi1_stop_tid_reap_timer(qp); /* * Go though the entire ack queue and clear any outstanding @@ -4102,6 +4127,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) goto ack_done; + trace_hfi1_ack(qp, psn); + flow = &req->flows[req->setup_head]; flow->pkt = 0; flow->tid_idx = 0; @@ -4129,13 +4156,17 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) } memcpy(flow->tid_entry, packet->ebuf, pktlen); flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); req->comp_seg++; + trace_hfi1_tid_write_sender_rcv_resp(qp, 0); /* * Walk the TID_ENTRY list to make sure we have enough space for a * complete segment. 
*/ for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_write_resp(/* entry */ + qp, i, flow->tid_entry[i]); if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { status = IB_WC_LOC_LEN_ERR; goto ack_err; @@ -4147,6 +4178,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) goto ack_err; } + trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); /* * If this is the first response for this request, set the initial * flow index to the current flow. @@ -4221,6 +4254,8 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, next_offset = flow->tid_offset + *len; last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && next_offset >= tidlen) || (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); rcu_read_lock(); remote = rcu_dereference(qpriv->tid_rdma.remote); @@ -4303,6 +4338,10 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) * - The entire request is complete and there are no more requests * (of any kind) in the queue. */ + trace_hfi1_rsp_rcv_tid_write_data(qp, psn); + trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_rcv_data(qp); if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) priv->r_tid_ack = priv->r_tid_tail; @@ -4451,6 +4490,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) unsigned long flags; u16 fidx; + trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); @@ -4458,6 +4498,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); /* If we are waiting for an ACK to RESYNC, drop any other packets */ if ((qp->s_flags & HFI1_S_WAIT_HALT) && @@ -4480,7 +4521,10 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) goto ack_op_err; req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); /* Drop stale ACK/NAK */ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) @@ -4493,11 +4537,14 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) /* advance acked segment pointer */ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); req->r_last_acked = flow->flow_state.resp_ib_psn; + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); if (req->ack_seg == req->total_segs) { req->state = TID_REQUEST_COMPLETE; wqe = do_rc_completion(qp, wqe, to_iport(qp->ibqp.device, qp->port_num)); + trace_hfi1_sender_rcv_tid_ack(qp); atomic_dec(&qpriv->n_tid_requests); if (qp->s_acked == qp->s_tail) break; @@ -4506,8 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) req = wqe_to_tid_req(wqe); } flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); } + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (aeth >> 29) { case 0: /* ACK */ if (qpriv->s_flags & RVT_S_WAIT_ACK) @@ -4614,6 +4664,9 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) flow->pkt = 0; spsn += flow->npkts; resync_psn += flow->npkts; + trace_hfi1_tid_flow_rcv_tid_ack(qp, + fidx, + flow); } if 
(++last_acked == qpriv->s_tid_cur + 1) break; @@ -4638,6 +4691,8 @@ done: case 0: /* PSN sequence error */ flow = &req->flows[req->acked_tail]; fspsn = full_flow_psn(flow, flow->flow_state.spsn); + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, + flow); req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); req->cur_seg = req->ack_seg; qpriv->s_tid_tail = qp->s_acked; @@ -4713,16 +4768,28 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) struct rvt_qp *qp = priv->owner; struct rvt_swqe *wqe; unsigned long flags; + struct tid_rdma_request *req; spin_lock_irqsave(&qp->r_lock, flags); spin_lock(&qp->s_lock); + trace_hfi1_tid_write_sender_retry_timeout(qp, 0); if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { hfi1_stop_tid_retry_timer(qp); if (!priv->s_retry) { + trace_hfi1_msg_tid_retry_timeout(/* msg */ + qp, + "Exhausted retries. Tid retry timeout = ", + (u64)priv->tid_retry_timeout_jiffies); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } else { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_tid_retry_timeout(/* req */ + qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); + priv->s_flags &= ~RVT_S_WAIT_ACK; /* Only send one packet (the RESYNC) */ priv->s_flags |= RVT_S_SEND_ONE; @@ -4818,6 +4885,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) * sync point and the flow has/will be reprogrammed */ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + trace_hfi1_tid_write_rsp_rcv_resync(qp); /* * Reset all TID flow information with the new generation. @@ -4832,6 +4900,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) e = &qp->s_ack_queue[idx]; if (e->opcode == TID_OP(WRITE_REQ)) { req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* start from last unacked segment */ for (flow_idx = req->clear_tail; @@ -4854,6 +4924,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) full_flow_psn(flow, flow->flow_state.spsn); fs->psn += flow->npkts; + trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, + flow); } } if (idx == qp->s_tail_ack_queue) @@ -4913,6 +4985,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u8 opcode = TID_OP(WRITE_DATA); lockdep_assert_held(&qp->s_lock); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); /* * Prioritize the sending of the requests and responses over the * sending of the TID RDMA data packets. @@ -4970,6 +5043,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (priv->s_state) { case TID_OP(WRITE_REQ): case TID_OP(WRITE_RESP): @@ -4997,6 +5072,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) * 3.2.2 Advance RESP pointers. * 3.3 Return indicating progress made. 
*/ + trace_hfi1_sender_make_tid_pkt(qp); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); len = wqe->length; @@ -5004,6 +5081,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!req->comp_seg || req->cur_seg == req->comp_seg) goto bail; + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, &len); @@ -5028,6 +5107,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) break; case TID_OP(RESYNC): + trace_hfi1_sender_make_tid_pkt(qp); /* Use generation from the most recently received response */ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); req = wqe_to_tid_req(wqe); @@ -5098,6 +5178,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, u16 flow; struct tid_rdma_request *req, *nreq; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; @@ -5128,6 +5209,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, req = ack_to_tid_req(e); } + trace_hfi1_rsp_make_tid_ack(qp, e->psn); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); /* * If we've sent all the ACKs that we can, we are done * until we get more segments... @@ -5199,6 +5283,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); len = 0; @@ -5251,6 +5338,7 @@ static void hfi1_do_tid_send(struct rvt_qp *qp) ps.in_thread = false; ps.timeout_int = qp->timeout_jiffies / 8; + trace_hfi1_rc_do_tid_send(qp, false); spin_lock_irqsave(&qp->s_lock, ps.flags); /* Return if we are already busy processing a work request. 
*/ diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 28181d711fed..9a3d236bcc88 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -133,6 +133,11 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" #define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" #define TID_READ_RSP_PRN "verbs_qp 0x%x" +#define TID_WRITE_REQ_PRN "original_qp 0x%x" +#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_WRITE_DATA_PRN "verbs_qp 0x%x" +#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_RESYNC_PRN "verbs_qp 0x%x" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -327,6 +332,45 @@ const char *parse_everbs_hdrs( parse_syndrome(be32_to_cpu(eh->aeth) >> 24), be32_to_cpu(eh->aeth) & IB_MSN_MASK); break; + case OP(TID_RDMA, WRITE_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_WRITE_REQ_PRN, + le32_to_cpu(eh->tid_rdma.w_req.kdeth0), + le32_to_cpu(eh->tid_rdma.w_req.kdeth1), + ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.w_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.w_req.reth.length), + be32_to_cpu(eh->tid_rdma.w_req.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_RESP): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_WRITE_RSP_PRN, + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0), + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1), + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_DATA_LAST): + case OP(TID_RDMA, WRITE_DATA): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN, + le32_to_cpu(eh->tid_rdma.w_data.kdeth0), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.w_data.kdeth1), + KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.w_data.verbs_qp)); + break; case OP(TID_RDMA, READ_REQ): trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " TID_READ_REQ_PRN, @@ -359,6 +403,28 @@ const char *parse_everbs_hdrs( IB_MSN_MASK), be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); break; + case OP(TID_RDMA, ACK): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_ACK_PRN, + le32_to_cpu(eh->tid_rdma.ack.kdeth0), + le32_to_cpu(eh->tid_rdma.ack.kdeth1), + be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.ack.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.ack.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.ack.verbs_psn), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.ack.verbs_qp)); + break; + case OP(TID_RDMA, RESYNC): + trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN, + le32_to_cpu(eh->tid_rdma.resync.kdeth0), + le32_to_cpu(eh->tid_rdma.resync.kdeth1), + be32_to_cpu(eh->tid_rdma.resync.verbs_qp)); + break; /* aeth + atomicacketh */ case OP(RC, 
ATOMIC_ACKNOWLEDGE): trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 1116238bf24d..d1372cc66de6 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,8 +79,14 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_WRITE_REQ), \ + ib_opcode_name(TID_RDMA_WRITE_RESP), \ + ib_opcode_name(TID_RDMA_WRITE_DATA), \ + ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \ ib_opcode_name(TID_RDMA_READ_REQ), \ ib_opcode_name(TID_RDMA_READ_RESP), \ + ib_opcode_name(TID_RDMA_RESYNC), \ + ib_opcode_name(TID_RDMA_ACK), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index a45b5257d6c4..548dfc45a407 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -56,16 +56,33 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "fpsn 0x%x flow_flags 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ - "cur_seg %u comp_seg %u ack_seg %u " \ + "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ - "state %u r_flow_psn 0x%x " \ - "s_next_psn 0x%x" + "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \ + "r_last_ackd 0x%x s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" +#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \ + "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \ + "pending_tid_w_segs %u sync_pt %s " \ + "ps_nak_psn 0x%x ps_nak_state 0x%x " \ + "prnr_nak_state 0x%x hw_flow_index %u generation "\ + "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \ + "r_next_psn_kdeth 0x%x" + +#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \ + "s_tid_tail %u s_tid_head %u " \ + "pending_tid_w_resp %u n_requests %u " \ + "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\ + "iow_flags 0x%lx s_state 0x%x s_retry %u" + +#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \ + "RcvTypeError 0x%x PSN 0x%x" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -382,6 +399,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + DECLARE_EVENT_CLASS(/* tid_flow_page */ hfi1_tid_flow_page_template, TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, @@ -562,6 +591,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, flow) ); +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + 
hfi1_tid_flow_template, hfi1_tid_flow_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + DECLARE_EVENT_CLASS(/* tid_node */ hfi1_tid_node_template, TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, @@ -656,6 +721,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, ent) ); +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + DECLARE_EVENT_CLASS(/* rsp_info */ hfi1_responder_info_template, TP_PROTO(struct rvt_qp *qp, u32 psn), @@ -738,6 +815,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, psn) ); +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + DECLARE_EVENT_CLASS(/* sender_info */ hfi1_sender_info_template, TP_PROTO(struct rvt_qp *qp), @@ -830,6 +943,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp) ); +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + DECLARE_EVENT_CLASS(/* tid_read_sender */ hfi1_tid_read_sender_template, TP_PROTO(struct rvt_qp *qp, char newreq), @@ -908,12 +1033,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __field(u32, cur_seg) __field(u32, comp_seg) __field(u32, ack_seg) + __field(u32, alloc_seg) __field(u32, total_segs) __field(u16, setup_head) __field(u16, clear_tail) __field(u16, flow_idx) + __field(u16, acked_tail) __field(u32, state) + __field(u32, r_ack_psn) __field(u32, r_flow_psn) + __field(u32, r_last_acked) __field(u32, s_next_psn) ), TP_fast_assign(/* assign */ @@ -926,12 +1055,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg = req->cur_seg; __entry->comp_seg = req->comp_seg; __entry->ack_seg = req->ack_seg; + __entry->alloc_seg = req->alloc_seg; __entry->total_segs = req->total_segs; 
__entry->setup_head = req->setup_head; __entry->clear_tail = req->clear_tail; __entry->flow_idx = req->flow_idx; + __entry->acked_tail = req->acked_tail; __entry->state = req->state; + __entry->r_ack_psn = req->r_ack_psn; __entry->r_flow_psn = req->r_flow_psn; + __entry->r_last_acked = req->r_last_acked; __entry->s_next_psn = req->s_next_psn; ), TP_printk(/* print */ @@ -945,12 +1078,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg, __entry->comp_seg, __entry->ack_seg, + __entry->alloc_seg, __entry->total_segs, __entry->setup_head, __entry->clear_tail, __entry->flow_idx, + __entry->acked_tail, __entry->state, + __entry->r_ack_psn, __entry->r_flow_psn, + __entry->r_last_acked, __entry->s_next_psn ) ); @@ -1004,6 +1141,97 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, newreq, opcode, psn, lpsn, req) ); +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* 
event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + DECLARE_EVENT_CLASS(/* rc_rcv_err */ hfi1_rc_rcv_err_template, TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), @@ -1090,6 +1318,289 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, sge) ); +DECLARE_EVENT_CLASS(/* tid_write_sp */ + hfi1_tid_write_rsp_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, r_tid_head) + __field(u32, r_tid_tail) + __field(u32, r_tid_ack) + __field(u32, r_tid_alloc) + __field(u32, alloc_w_segs) + __field(u32, pending_tid_w_segs) + __field(bool, sync_pt) + __field(u32, ps_nak_psn) + __field(u8, ps_nak_state) + __field(u8, prnr_nak_state) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(u32, flow_flags) + __field(bool, resync) + __field(u32, r_next_psn_kdeth) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->r_tid_head = priv->r_tid_head; + __entry->r_tid_tail = priv->r_tid_tail; + __entry->r_tid_ack = priv->r_tid_ack; + __entry->r_tid_alloc = priv->r_tid_alloc; + __entry->alloc_w_segs = priv->alloc_w_segs; + __entry->pending_tid_w_segs = priv->pending_tid_w_segs; + __entry->sync_pt = priv->sync_pt; + __entry->ps_nak_psn = priv->s_nak_psn; + __entry->ps_nak_state = priv->s_nak_state; + __entry->prnr_nak_state = priv->rnr_nak_state; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->flow_flags = priv->flow_state.flags; + __entry->resync = priv->resync; + __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth; + ), + TP_printk(/* print */ + TID_WRITE_RSPDR_PRN, + __get_str(dev), + __entry->qpn, + __entry->r_tid_head, + __entry->r_tid_tail, + __entry->r_tid_ack, + __entry->r_tid_alloc, + __entry->alloc_w_segs, + __entry->pending_tid_w_segs, + __entry->sync_pt ? "yes" : "no", + __entry->ps_nak_psn, + __entry->ps_nak_state, + __entry->prnr_nak_state, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->flow_flags, + __entry->resync ? 
"yes" : "no", + __entry->r_next_psn_kdeth + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_write_sender */ + hfi1_tid_write_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, s_tid_cur) + __field(u32, s_tid_tail) + __field(u32, s_tid_head) + __field(u32, pending_tid_w_resp) + __field(u32, n_requests) + __field(u32, n_tid_requests) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->s_tid_cur = priv->s_tid_cur; + __entry->s_tid_tail = priv->s_tid_tail; + __entry->s_tid_head = priv->s_tid_head; + __entry->pending_tid_w_resp = priv->pending_tid_w_resp; + __entry->n_requests = atomic_read(&priv->n_requests); + __entry->n_tid_requests = atomic_read(&priv->n_tid_requests); + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; + __entry->s_retry = priv->s_retry; + ), + TP_printk(/* print */ + TID_WRITE_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->s_tid_cur, + __entry->s_tid_tail, + __entry->s_tid_head, + __entry->pending_tid_w_resp, + __entry->n_requests, + __entry->n_tid_requests, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req, + 
TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_ack */ + hfi1_tid_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u32, req_psn) + __field(u32, resync_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->req_psn = req_psn; + __entry->resync_psn = resync_psn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->req_psn, + __entry->resync_psn + ) +); + +DEFINE_EVENT(/* rcv_tid_ack */ + hfi1_tid_ack_template, hfi1_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn) +); + +DECLARE_EVENT_CLASS(/* kdeth_eflags_error */ + hfi1_kdeth_eflags_error_template, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, rcv_type) + __field(u8, rte) + __field(u32, psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->rcv_type = rcv_type; + __entry->rte = rte; + __entry->psn = psn; + ), + TP_printk(/* print */ + KDETH_EFLAGS_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->rcv_type, + __entry->rte, + __entry->psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index 37dbb3e599c3..09eb0c9ada00 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -846,6 +846,12 @@ DEFINE_EVENT( TP_ARGS(qp, flag) ); +DEFINE_EVENT(/* event */ + hfi1_do_send_template, hfi1_rc_do_tid_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + DEFINE_EVENT( hfi1_do_send_template, hfi1_rc_expired_time_slice, TP_PROTO(struct rvt_qp *qp, bool flag), -- cgit v1.2.3-59-g8ed1b From 34025fb0c4c9d6b2e294f8f8f0a82491a13c83a2 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:19 -0800 Subject: IB/hfi1: Prioritize the sending of ACK packets ACK packets are generally associated with request completion and resource release and therefore should be sent first. 
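For illustration, a small standalone C model of the selection rule spelled out in policy (4) below: one unit of priority is worth 16 units of starvation, so the entry with the largest (priority << 4) + starved_cnt wins. This is not driver code; waiter, waiter_score and pick_top are invented names for the sketch.

#include <stdio.h>

struct waiter {
	unsigned char priority;    /* bumped when an ACK/TID-ACK is queued */
	unsigned char starved_cnt; /* bumped each time the entry is re-queued */
};

/* Combined score: 1 priority == 16 starve_cnt. */
static unsigned int waiter_score(const struct waiter *w)
{
	return ((unsigned int)w->priority << 4) + w->starved_cnt;
}

/* Return the index of the entry that should be woken first. */
static unsigned int pick_top(const struct waiter *w, unsigned int n)
{
	unsigned int i, top = 0;

	for (i = 1; i < n; i++)
		if (waiter_score(&w[i]) > waiter_score(&w[top]))
			top = i;
	return top;
}

int main(void)
{
	struct waiter qps[] = {
		{ .priority = 0, .starved_cnt = 10 },	/* starved, no ACK */
		{ .priority = 1, .starved_cnt = 0 },	/* ACK pending */
	};

	printf("top index: %u\n", pick_top(qps, 2));	/* prints 1: 16 > 10 */
	return 0;
}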
This patch optimizes the send engine by using the following policies: (1) QPs with RVT_S_ACK_PENDING bit set in qp->s_flags or qpriv->s_flags should have their priority incremented; (2) QPs with ACK or TID-ACK packet queued should have their priority incremented; (3) When a QP is queued to the wait list due to resource constraints, it will be queued to the head if it has ACK packet to send; (4) When selecting qps to run from the wait list, the one with the highest priority and starve_cnt will be selected; each priority will be equivalent to a fixed number of starve_cnt (16). Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/iowait.c | 34 ++++++++++++- drivers/infiniband/hw/hfi1/iowait.h | 87 +++++++++++++++++++++----------- drivers/infiniband/hw/hfi1/pio.c | 18 ++++--- drivers/infiniband/hw/hfi1/qp.c | 15 +++++- drivers/infiniband/hw/hfi1/rc.c | 1 + drivers/infiniband/hw/hfi1/sdma.c | 24 +++++---- drivers/infiniband/hw/hfi1/sdma_txreq.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 1 + drivers/infiniband/hw/hfi1/user_sdma.c | 6 ++- drivers/infiniband/hw/hfi1/verbs.c | 1 + drivers/infiniband/hw/hfi1/verbs_txreq.h | 1 + drivers/infiniband/hw/hfi1/vnic_sdma.c | 6 ++- 12 files changed, 144 insertions(+), 51 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index 582f1ba136ff..adb4a1ba921b 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -6,6 +6,9 @@ #include "iowait.h" #include "trace_iowait.h" +/* 1 priority == 16 starve_cnt */ +#define IOWAIT_PRIORITY_STARVE_SHIFT 4 + void iowait_set_flag(struct iowait *wait, u32 flag) { trace_hfi1_iowait_set(wait, flag); @@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)) { int i; @@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit, wait->sleep = sleep; wait->wakeup = wakeup; wait->sdma_drained = sdma_drained; + wait->init_priority = init_priority; wait->flags = 0; for (i = 0; i < IOWAIT_SES; i++) { wait->wait[i].iow = wait; @@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w) iowait_set_flag(w->iow, IOWAIT_PENDING_TID); return IOWAIT_TID_SE; } + +/** + * iowait_priority_update_top - update the top priority entry + * @w: the iowait struct + * @top: a pointer to the top priority entry + * @idx: the index of the current iowait in an array + * @top_idx: the array index for the iowait entry that has the top priority + * + * This function is called to compare the priority of a given + * iowait with the given top priority entry. The top index will + * be returned. 
+ */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx) +{ + u8 cnt, tcnt; + + /* Convert priority into starve_cnt and compare the total.*/ + cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt; + tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + + top->starved_cnt; + if (cnt > tcnt) + return idx; + else + return top_idx; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index bd913701761d..07847cb72169 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -100,6 +100,7 @@ struct iowait_work { * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @init_priority: callback to manipulate priority * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 @@ -109,7 +110,7 @@ struct iowait_work { * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list * @flags: wait flags (one per QP) - * @wait: SE array + * @wait: SE array for multiple legs * * This is to be embedded in user's state structure * (QP or PQ). @@ -120,10 +121,13 @@ struct iowait_work { * are callbacks for the ULP to implement * what ever queuing/dequeuing of * the embedded iowait and its containing struct - * when a resource shortage like SDMA ring space is seen. + * when a resource shortage like SDMA ring space + * or PIO credit space is seen. * * Both potentially have locks help - * so sleeping is not allowed. + * so sleeping is not allowed and it is not + * supported to submit txreqs from the wakeup + * call directly because of lock conflicts. * * The wait_dma member along with the iow * @@ -143,6 +147,7 @@ struct iowait { ); void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); + void (*init_priority)(struct iowait *wait); seqlock_t *lock; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; @@ -152,6 +157,7 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + u8 priority; unsigned long flags; struct iowait_work wait[IOWAIT_SES]; }; @@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)); + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)); /** * iowait_schedule() - schedule the default send engine work @@ -339,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w) tx = list_first_entry(&w->tx_head, struct sdma_txreq, list); num_desc = tx->num_desc; + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; } return num_desc; } @@ -352,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w) return num_desc; } +static inline void iowait_update_priority(struct iowait_work *w) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } +} + +static inline void iowait_update_all_priority(struct iowait *w) +{ + iowait_update_priority(&w->wait[IOWAIT_IB_SE]); + iowait_update_priority(&w->wait[IOWAIT_TID_SE]); +} + +static inline void iowait_init_priority(struct iowait *w) +{ + w->priority = 0; + if (w->init_priority) + w->init_priority(w); +} + +static inline void iowait_get_priority(struct iowait *w) +{ + iowait_init_priority(w); + iowait_update_all_priority(w); +} 
+ /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -368,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w, /* * To play fair, insert the iowait at the tail of the wait queue if it * has already sent some packets; Otherwise, put it at the head. + * However, if it has priority packets to send, also put it at the + * head. */ - if (pkts_sent) { - list_add_tail(&w->list, wait_head); + if (pkts_sent) w->starved_cnt = 0; - } else { - list_add(&w->list, wait_head); + else w->starved_cnt++; - } + + if (w->priority > 0 || !pkts_sent) + list_add(&w->list, wait_head); + else + list_add_tail(&w->list, wait_head); } /** @@ -392,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) w->starved_cnt = 0; } -/** - * iowait_starve_find_max - Find the maximum of the starve count - * @w: the iowait struct - * @max: a variable containing the max starve count - * @idx: the index of the current iowait in an array - * @max_idx: a variable containing the array index for the - * iowait entry that has the max starve count - * - * This function is called to compare the starve count of a - * given iowait with the given max starve count. The max starve - * count and the index will be updated if the iowait's start - * count is larger. - */ -static inline void iowait_starve_find_max(struct iowait *w, u8 *max, - uint idx, uint *max_idx) -{ - if (w->starved_cnt > *max) { - *max = w->starved_cnt; - *max_idx = idx; - } -} +/* Update the top priority index */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx); /** * iowait_packet_queued() - determine if a packet is queued diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 04126d7e318d..a1de566fe95e 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc) struct rvt_qp *qp; struct hfi1_qp_priv *priv; unsigned long flags; - uint i, n = 0, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, top_idx = 0; if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && dd->send_contexts[sc->sw_index].type != SC_VL15) @@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc) if (n == ARRAY_SIZE(qps)) break; wait = list_first_entry(list, struct iowait, list); + iowait_get_priority(wait); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; - iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); + if (n) { + priv = qps[top_idx]->priv; + top_idx = iowait_priority_update_top(wait, + &priv->s_iowait, + n, top_idx); + } + /* refcount held until actual wake up */ qps[n++] = qp; } @@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc) } write_sequnlock_irqrestore(&sc->waitlock, flags); - /* Wake up the most starved one first */ + /* Wake up the top-priority one first */ if (n) - hfi1_qp_wakeup(qps[max_idx], + hfi1_qp_wakeup(qps[top_idx], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != top_idx) hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index cfd598e4b303..d8f7add935df 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -518,6 +518,7 @@ static int iowait_sleep( ibp->rvp.n_dmawait++; qp->s_flags |= 
RVT_S_WAIT_DMA_DESC; + iowait_get_priority(&priv->s_iowait); iowait_queue(pkts_sent, &priv->s_iowait, &sde->dmawait); priv->s_iowait.lock = &sde->waitlock; @@ -567,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait) spin_unlock_irqrestore(&qp->s_lock, flags); } +static void hfi1_init_priority(struct iowait *w) +{ + struct rvt_qp *qp = iowait_to_qp(w); + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->s_flags & RVT_S_ACK_PENDING) + w->priority++; + if (priv->s_flags & RVT_S_ACK_PENDING) + w->priority++; +} + /** * qp_to_sdma_engine - map a qp to a send engine * @qp: the QP @@ -727,7 +739,8 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, - iowait_sdma_drained); + iowait_sdma_drained, + hfi1_init_priority); return priv; } diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 82afa7736be7..e6726c1ab866 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -390,6 +390,7 @@ normal_no_state: bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 96897a91fb0a..b0110728f541 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1747,10 +1747,9 @@ retry: */ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) { - struct iowait *wait, *nw; + struct iowait *wait, *nw, *twait; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; - uint i, n = 0, seq, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, seq, tidx = 0; #ifdef CONFIG_SDMA_VERBOSITY dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, @@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) continue; if (n == ARRAY_SIZE(waits)) break; + iowait_init_priority(wait); num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; - /* Find the most starved wait memeber */ - iowait_starve_find_max(wait, &max_starved_cnt, - n, &max_idx); + /* Find the top-priority wait memeber */ + if (n) { + twait = waits[tidx]; + tidx = + iowait_priority_update_top(wait, + twait, + n, + tidx); + } list_del_init(&wait->list); waits[n++] = wait; } @@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) } } while (read_seqretry(&sde->waitlock, seq)); - /* Schedule the most starved one first */ + /* Schedule the top-priority entry first */ if (n) - waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != tidx) waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); } diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index bf7d777d756e..514a4784566b 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -91,6 +91,7 @@ struct sdma_desc { #define SDMA_TXREQ_F_URGENT 0x0001 #define SDMA_TXREQ_F_AHG_COPY 0x0002 #define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_VIP 0x0010 struct sdma_txreq; typedef void (*callback_t)(struct sdma_txreq *, int); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a49eb3d9b5b9..bc2ff83026f7 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ 
-5296,6 +5296,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, ps->s_txreq->ss = NULL; hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, ps); + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; return 1; bail: /* diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 6764114b886c..8bfbc6d7ea34 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -144,8 +144,10 @@ static int defer_packet_queue( */ xchg(&pq->state, SDMA_PKT_Q_DEFERRED); write_seqlock(&sde->waitlock); - if (list_empty(&pq->busy.list)) + if (list_empty(&pq->busy.list)) { + iowait_get_priority(&pq->busy); iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; eagain: @@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->mm = fd->mm; iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, - activate_packet_queue, NULL); + activate_packet_queue, NULL, NULL); pq->reqidx = 0; pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ab97d71cdd92..55a56b3d7f83 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -945,6 +945,7 @@ static int pio_wait(struct rvt_qp *qp, dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); + iowait_get_priority(&priv->s_iowait); iowait_queue(ps->pkts_sent, &priv->s_iowait, &sc->piowait); priv->s_iowait.lock = &sc->waitlock; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 2a77af26a231..b002e96eb335 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, tx->txreq.num_desc = 0; /* Set the header type */ tx->phdr.hdr.hdr_type = priv->hdr_type; + tx->txreq.flags = 0; return tx; } diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 1f81c480e028..af1b1ffcb38e 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, } vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; - if (list_empty(&vnic_sdma->wait.list)) + if (list_empty(&vnic_sdma->wait.list)) { + iowait_get_priority(wait->iow); iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; } @@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) iowait_init(&vnic_sdma->wait, 0, NULL, NULL, hfi1_vnic_sdma_sleep, - hfi1_vnic_sdma_wakeup, NULL); + hfi1_vnic_sdma_wakeup, NULL, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; vnic_sdma->vinfo = vinfo; -- cgit v1.2.3-59-g8ed1b From 70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 09:59:15 -0800 Subject: mm: make mm->pinned_vm an atomic64 counter Taking a sleeping lock to _only_ increment a variable is quite the overkill, and pretty much all users do this. Furthermore, some drivers (ie: infiniband and scif) that need pinned semantics can go to quite some trouble to actually delay via workqueue (un)accounting for pinned pages when not possible to acquire it. 
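As a rough userspace model of the change described in the next paragraph (not kernel code: pthread_rwlock_t stands in for mmap_sem, C11 atomic_ulong for atomic64_t, and the function names are invented):

#include <pthread.h>
#include <stdatomic.h>

static pthread_rwlock_t fake_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long pinned_vm_plain;
static atomic_ulong pinned_vm_atomic;

/* Old pattern: take the sleeping lock only to bump a counter. */
static void pin_account_locked(unsigned long npages)
{
	pthread_rwlock_wrlock(&fake_mmap_sem);
	pinned_vm_plain += npages;
	pthread_rwlock_unlock(&fake_mmap_sem);
}

/* New pattern: a single atomic add, no lock or deferred accounting needed. */
static void pin_account_atomic(unsigned long npages)
{
	atomic_fetch_add(&pinned_vm_atomic, npages);
}

int main(void)
{
	pin_account_locked(4);
	pin_account_atomic(4);
	return 0;
}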
By making the counter atomic we no longer need to hold the mmap_sem and can simply some code around it for pinned_vm users. The counter is 64-bit such that we need not worry about overflows such as rdma user input controlled from userspace. Reviewed-by: Ira Weiny Reviewed-by: Christoph Lameter Reviewed-by: Daniel Jordan Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 12 ++++++------ drivers/infiniband/hw/hfi1/user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- drivers/infiniband/hw/usnic/usnic_uiom.c | 8 ++++---- drivers/misc/mic/scif/scif_rma.c | 6 +++--- fs/proc/task_mmu.c | 2 +- include/linux/mm_types.h | 2 +- kernel/events/core.c | 8 ++++---- kernel/fork.c | 2 +- mm/debug.c | 5 +++-- 10 files changed, 28 insertions(+), 27 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 1efe0a74e06b..678abe1afcba 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -166,13 +166,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { + new_pinned = atomic64_read(&mm->pinned_vm) + npages; + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { up_write(&mm->mmap_sem); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; + atomic64_set(&mm->pinned_vm, new_pinned); up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -234,7 +234,7 @@ umem_release: __ib_umem_release(context->device, umem, 0); vma: down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); up_write(&mm->mmap_sem); out: if (vma_list) @@ -263,7 +263,7 @@ static void ib_umem_release_defer(struct work_struct *work) struct ib_umem *umem = container_of(work, struct ib_umem, work); down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); @@ -302,7 +302,7 @@ void ib_umem_release(struct ib_umem *umem) } else { down_write(&umem->owning_mm->mmap_sem); } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index e341e6dcc388..40a6e434190f 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -92,7 +92,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, size = DIV_ROUND_UP(size, PAGE_SIZE); down_read(&mm->mmap_sem); - pinned = mm->pinned_vm; + pinned = atomic64_read(&mm->pinned_vm); up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages. 
*/ @@ -112,7 +112,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np return ret; down_write(&mm->mmap_sem); - mm->pinned_vm += ret; + atomic64_add(ret, &mm->pinned_vm); up_write(&mm->mmap_sem); return ret; @@ -131,7 +131,7 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, if (mm) { /* during close after signal, mm can be NULL */ down_write(&mm->mmap_sem); - mm->pinned_vm -= npages; + atomic64_sub(npages, &mm->pinned_vm); up_write(&mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 075f09fb7ce3..c6c81022d313 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -75,7 +75,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->pinned_vm += num_pages; + atomic64_add(num_pages, ¤t->mm->pinned_vm); ret = 0; goto bail; @@ -156,7 +156,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages) __qib_release_user_pages(p, num_pages, 1); if (current->mm) { - current->mm->pinned_vm -= num_pages; + atomic64_sub(num_pages, ¤t->mm->pinned_vm); up_write(¤t->mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index ce01a59fccc4..854436a2b437 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -129,7 +129,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, uiomr->owning_mm = mm = current->mm; down_write(&mm->mmap_sem); - locked = npages + current->mm->pinned_vm; + locked = npages + atomic64_read(¤t->mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -187,7 +187,7 @@ out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); else { - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); mmgrab(uiomr->owning_mm); } @@ -441,7 +441,7 @@ static void usnic_uiom_release_defer(struct work_struct *work) container_of(work, struct usnic_uiom_reg, work); down_write(&uiomr->owning_mm->mmap_sem); - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); @@ -469,7 +469,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, } else { down_write(&uiomr->owning_mm->mmap_sem); } - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index 749321eb91ae..2448368f181e 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -285,7 +285,7 @@ __scif_dec_pinned_vm_lock(struct mm_struct *mm, } else { down_write(&mm->mmap_sem); } - mm->pinned_vm -= nr_pages; + atomic64_sub(nr_pages, &mm->pinned_vm); up_write(&mm->mmap_sem); return 0; } @@ -299,7 +299,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, return 0; locked = nr_pages; - locked += mm->pinned_vm; + locked += atomic64_read(&mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { dev_err(scif_info.mdev.this_device, @@ -307,7 +307,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct 
*mm, locked, lock_limit); return -ENOMEM; } - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..d2902962244d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); - SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2c471a2c43fa..acea2ea2d6c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long pinned_vm; /* Refcount permanently increased */ + atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..29e9f2473656 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5459,7 +5459,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); @@ -5532,7 +5532,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5680,7 +5680,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5721,7 +5721,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..85e08c379a9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -981,7 +981,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); diff --git a/mm/debug.c b/mm/debug.c index 0abb987dad9b..7d13941a72f9 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -135,7 +135,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack 
%lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -166,7 +166,8 @@ void dump_mm(const struct mm_struct *mm) mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, + atomic64_read(&mm->pinned_vm), + mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, -- cgit v1.2.3-59-g8ed1b From 0e15c253363e9a8824e6b9c9fc17468eb924cebd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 09:59:18 -0800 Subject: drivers/IB,hfi1: do not se mmap_sem This driver already uses gup_fast() and thus we can just drop the mmap_sem protection around the pinned_vm counter. Note that the window between when hfi1_can_pin_pages() is called and the actual counter is incremented remains the same as mmap_sem was _only_ used for when ->pinned_vm was touched. Reviewed-by: Ira Weiny Signed-off-by: Davidlohr Bueso Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_pages.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 40a6e434190f..24b592c6522e 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -91,9 +91,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, /* Convert to number of pages */ size = DIV_ROUND_UP(size, PAGE_SIZE); - down_read(&mm->mmap_sem); pinned = atomic64_read(&mm->pinned_vm); - up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages. */ if (pinned + npages >= ulimit && !can_lock) @@ -111,9 +109,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np if (ret < 0) return ret; - down_write(&mm->mmap_sem); atomic64_add(ret, &mm->pinned_vm); - up_write(&mm->mmap_sem); return ret; } @@ -130,8 +126,6 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, } if (mm) { /* during close after signal, mm can be NULL */ - down_write(&mm->mmap_sem); atomic64_sub(npages, &mm->pinned_vm); - up_write(&mm->mmap_sem); } } -- cgit v1.2.3-59-g8ed1b From e50838c27ff7e1438ea2f9ab4bfcb227f90a107f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Fri, 15 Feb 2019 13:45:30 -0800 Subject: IB/hfi1: Fix a build warning for TID RDMA READ The following build warning was produced for the TID RDMA READ patch ("IB/hfi1: Enable TID RDMA READ protocol"): drivers/infiniband/hw/hfi1/qp.c: In function 'hfi1_setup_wqe': drivers/infiniband/hw/hfi1/qp.c:328:3: warning: this statement may fall through [-Wimplicit-fallthrough=] hfi1_setup_tid_rdma_wqe(qp, wqe); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/hfi1/qp.c:329:2: note: here case IB_QPT_UC: ^~~~ This patch will fix the issue by adding the "fall through" comment. 
Fixes: f1ab4efa6d32 ("IB/hfi1: Enable TID RDMA READ protocol") Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index d8f7add935df..9b643c2409cf 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -326,6 +326,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) switch (qp->ibqp.qp_type) { case IB_QPT_RC: hfi1_setup_tid_rdma_wqe(qp, wqe); + /* fall through */ case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 7264235ee74f51d26fbdf97bf98c6102a460484f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 20 Feb 2019 19:02:33 -0600 Subject: IB/hfi1: Add missing break in switch statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warning by adding a missing break: drivers/infiniband/hw/hfi1/tid_rdma.c: In function ‘hfi1_tid_rdma_wqe_interlock’: drivers/infiniband/hw/hfi1/tid_rdma.c:3251:3: warning: this statement may fall through [-Wimplicit-fallthrough=] switch (prev->wr.opcode) { ^~~~~~ drivers/infiniband/hw/hfi1/tid_rdma.c:3259:2: note: here case IB_WR_RDMA_READ: ^~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Fixes: c6c231175ccd ("IB/hfi1: Add interlock between TID RDMA WRITE and other requests") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kaike Wan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/tid_rdma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index bc2ff83026f7..fdda33aca77f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3256,6 +3256,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) default: break; } + break; case IB_WR_RDMA_READ: if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) break; -- cgit v1.2.3-59-g8ed1b From bc5add09764c123f58942a37c8335247e683d234 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 26 Feb 2019 08:45:35 -0800 Subject: IB/hfi1: Close race condition on user context disable and close When disabling and removing a receive context, it is possible for an asynchronous event (i.e IRQ) to occur. Because of this, there is a race between cleaning up the context, and the context being used by the asynchronous event. 
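Before the race trace, a self-contained userspace model of the semantics the fix below adopts: a lookup racing with the final put must fail instead of resurrecting an object whose reference count has already reached zero. Here atomic_uint stands in for the kernel's struct kref and ctxt_get_unless_zero mirrors kref_get_unless_zero(); the names are illustrative only.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ctxt {
	atomic_uint refcount;
};

/* Succeed only while the count is still non-zero. */
static bool ctxt_get_unless_zero(struct ctxt *c)
{
	unsigned int old = atomic_load(&c->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&c->refcount, &old, old + 1))
			return true;
		/* CAS failure reloaded 'old'; retry against the new value */
	}
	return false;
}

int main(void)
{
	struct ctxt c = { .refcount = 1 };

	printf("lookup while live: %s\n",
	       ctxt_get_unless_zero(&c) ? "ok" : "refused");
	atomic_fetch_sub(&c.refcount, 1);	/* the lookup's put */
	atomic_fetch_sub(&c.refcount, 1);	/* the final put */
	printf("lookup after teardown: %s\n",
	       ctxt_get_unless_zero(&c) ? "ok" : "refused");
	return 0;
}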
cpu 0 (context cleanup) rc->ref_count-- (ref_count == 0) hfi1_rcd_free() cpu 1 (IRQ (with rcd index)) rcd_get_by_index() lock ref_count+++ <-- reference count race (WARNING) return rcd unlock cpu 0 hfi1_free_ctxtdata() <-- incorrect free location lock remove rcd from array unlock free rcd This race will cause the following WARNING trace: WARNING: CPU: 0 PID: 175027 at include/linux/kref.h:52 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] CPU: 0 PID: 175027 Comm: IMB-MPI1 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 Call Trace: dump_stack+0x19/0x1b __warn+0xd8/0x100 warn_slowpath_null+0x1d/0x20 hfi1_rcd_get_by_index+0x84/0xa0 [hfi1] is_rcv_urgent_int+0x24/0x90 [hfi1] general_interrupt+0x1b6/0x210 [hfi1] __handle_irq_event_percpu+0x44/0x1c0 handle_irq_event_percpu+0x32/0x80 handle_irq_event+0x3c/0x60 handle_edge_irq+0x7f/0x150 handle_irq+0xe4/0x1a0 do_IRQ+0x4d/0xf0 common_interrupt+0x162/0x162 The race can also lead to a use after free which could be similar to: general protection fault: 0000 1 SMP CPU: 71 PID: 177147 Comm: IMB-MPI1 Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600KP/S2600KP, BIOS SE5C610.86B.11.01.0076.C4.111920150602 11/19/2015 task: ffff9962a8098000 ti: ffff99717a508000 task.ti: ffff99717a508000 __kmalloc+0x94/0x230 Call Trace: ? hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_user_sdma_process_request+0x9c8/0x1250 [hfi1] hfi1_aio_write+0xba/0x110 [hfi1] do_sync_readv_writev+0x7b/0xd0 do_readv_writev+0xce/0x260 ? handle_mm_fault+0x39d/0x9b0 ? pick_next_task_fair+0x5f/0x1b0 ? sched_clock_cpu+0x85/0xc0 ? __schedule+0x13a/0x890 vfs_writev+0x35/0x60 SyS_writev+0x7f/0x110 system_call_fastpath+0x22/0x27 Use the appropriate kref API to verify access. Reorder context cleanup to ensure context removal before cleanup occurs correctly. Cc: stable@vger.kernel.org # v4.14.0+ Fixes: f683c80ca68e ("IB/hfi1: Resolve kernel panics by reference counting receive contexts") Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/init.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw/hfi1') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6582184cc985..048b5d73ba39 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1455,7 +1455,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, struct hfi1_devdata *dd, u8 hw_pidx, u8 port); void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); int hfi1_rcd_put(struct hfi1_ctxtdata *rcd); -void hfi1_rcd_get(struct hfi1_ctxtdata *rcd); +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd); struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, u16 ctxt); struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 7841a0ad7cb6..2cc516439fde 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -214,12 +214,12 @@ static void hfi1_rcd_free(struct kref *kref) struct hfi1_ctxtdata *rcd = container_of(kref, struct hfi1_ctxtdata, kref); - hfi1_free_ctxtdata(rcd->dd, rcd); - spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); rcd->dd->rcd[rcd->ctxt] = NULL; spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + hfi1_free_ctxtdata(rcd->dd, rcd); + kfree(rcd); } @@ -242,10 +242,13 @@ int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) * @rcd: pointer to an initialized rcd data structure * * Use this to get a reference after the init. + * + * Return : reflect kref_get_unless_zero(), which returns non-zero on + * increment, otherwise 0. */ -void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd) { - kref_get(&rcd->kref); + return kref_get_unless_zero(&rcd->kref); } /** @@ -325,7 +328,8 @@ struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) spin_lock_irqsave(&dd->uctxt_lock, flags); if (dd->rcd[ctxt]) { rcd = dd->rcd[ctxt]; - hfi1_rcd_get(rcd); + if (!hfi1_rcd_get(rcd)) + rcd = NULL; } spin_unlock_irqrestore(&dd->uctxt_lock, flags); -- cgit v1.2.3-59-g8ed1b
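As a closing usage note for the fix above, a hedged kernel-style sketch of how an interrupt-path caller is expected to treat the now-failable lookup; handle_rcv_interrupt is an invented name, only hfi1_rcd_get_by_index() and hfi1_rcd_put() come from the driver:

/* The lookup can now return NULL while the context is being torn down,
 * so callers must check the result and drop their reference when done. */
static void handle_rcv_interrupt(struct hfi1_devdata *dd, u16 ctxt)
{
	struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index(dd, ctxt);

	if (!rcd)
		return;		/* context already gone; nothing to do */

	/* ... process the receive interrupt against rcd ... */

	hfi1_rcd_put(rcd);
}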