diff options
Diffstat (limited to '')
79 files changed, 2021 insertions, 996 deletions
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fabca5e51e3d..46d06678dfbe 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -11,6 +11,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> @@ -20,6 +21,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> @@ -168,6 +170,9 @@ static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; @@ -202,6 +207,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) } } +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + struct cma_device { struct list_head list; struct ib_device *device; @@ -420,11 +430,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) return hdr->ip_version >> 4; } -static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; +} + static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; @@ -445,6 +465,117 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) return (in_dev) ? 0 : -ENODEV; } +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + struct id_table_entry *entry_b) +{ + struct rdma_id_private *id_priv = list_first_entry( + &entry_b->id_list, struct rdma_id_private, id_list_entry); + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; + struct sockaddr *sb = cma_dst_addr(id_priv); + + if (ifindex_a != ifindex_b) + return (ifindex_a > ifindex_b) ? 1 : -1; + + if (sa->sa_family != sb->sa_family) + return sa->sa_family - sb->sa_family; + + if (sa->sa_family == AF_INET) + return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr, + (char *)&((struct sockaddr_in *)sb)->sin_addr, + sizeof(((struct sockaddr_in *)sa)->sin_addr)); + + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new, *parent = NULL; + struct id_table_entry *this, *node; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + new = &id_table.rb_node; + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), this); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + struct rb_node *node = root->rb_node; + struct id_table_entry *data; + int result; + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + result = compare_netdev_and_ip(ifindex, sa, data); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -481,16 +612,6 @@ static void cma_release_dev(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; -} - -static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; -} - static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; @@ -861,6 +982,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); @@ -1883,6 +2005,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -3172,8 +3295,11 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); - else if (rdma_protocol_roce(id->device, id->port_num)) + else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else @@ -4922,10 +5048,87 @@ out: return ret; } +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct rdma_id_private *id_priv = + container_of(_work, struct rdma_id_private, id.net_work); + struct rdma_cm_event event = {}; + + mutex_lock(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(id_priv, &event)) { + __acquire(&id_priv->handler_mutex); + id_priv->cm_id.ib = NULL; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + return; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + INIT_WORK(¤t_id->id.net_work, cma_netevent_work_handler); + cma_id_get(current_id); + queue_work(cma_wq, ¤t_id->id.net_work); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; @@ -5148,6 +5351,7 @@ static int __init cma_init(void) ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) @@ -5162,6 +5366,7 @@ static int __init cma_init(void) err_ib: ib_unregister_client(&cma_client); err: + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); @@ -5174,6 +5379,7 @@ static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 757a0ef79872..b7354c94cf1b 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -64,6 +64,7 @@ struct rdma_id_private { struct list_head listen_item; struct list_head listen_list; }; + struct list_head id_list_entry; struct cma_device *cma_dev; struct list_head mc_list; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 94d83b665a2f..29b1ab1d5f93 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, * In exclusive access mode, we check that the counter is zero (nobody * claimed this object) and we set it to -1. Releasing a shared access * lock is done simply by decreasing the counter. As for exclusive - * access locks, since only a single one of them is is allowed + * access locks, since only a single one of them is allowed * concurrently, setting the counter to zero is enough for releasing * this lock. */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 68197e576433..e958c43dd28f 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -250,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u32 port, /** * is_upper_ndev_bond_master_filter - Check if a given netdevice - * is bond master device of netdevice of the the RDMA device of port. + * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 79401e6c6aa9..785c37cae3c0 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -173,7 +173,7 @@ struct bnxt_re_dev { /* Max of 2 lossless traffic class supported per port */ u16 cosq[2]; - /* QP for for handling QP1 packets */ + /* QP for handling QP1 packets */ struct bnxt_re_gsi_context gsi_ctx; struct bnxt_re_stats stats; atomic_t nq_alloc_cnt; diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index 6eb739052121..14b92e12bf29 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config INFINIBAND_HFI1 tristate "Cornelis OPX Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT && I2C + depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML select MMU_NOTIFIER select CRC32 select I2C_ALGOBIT diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2e4cf2b11653..629beff053ad 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1179,8 +1179,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, goto done; ret = init_user_ctxt(fd, uctxt); - if (ret) + if (ret) { + hfi1_free_ctxt_rcv_groups(uctxt); goto done; + } user_init(uctxt); diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index d6bbdb8fcb50..5d9a7b09ca37 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -742,9 +742,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) kzalloc_node(sizeof(*tx->sdma_hdr), GFP_KERNEL, priv->dd->node); - netif_tx_napi_add(dev, &txq->napi, - hfi1_ipoib_poll_tx_ring, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring); } return 0; diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 03b098a494b5..3dfa5aff2512 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) * right now. */ set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); - netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); rc = msix_netdev_request_rcd_irq(rxq->rcd); if (rc) goto bail_context_irq_failure; diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c index 136f9a99e1e0..7690f996d5e3 100644 --- a/drivers/infiniband/hw/hfi1/pio_copy.c +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -172,7 +172,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n) } /* - * Read nbytes from "from" and and place them in the low bytes + * Read nbytes from "from" and place them in the low bytes * of pbuf->carry. Other bytes are left as-is. Any previous * value in pbuf->carry is lost. * diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2855e9ad4b32..f848eedc6a23 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -959,6 +959,7 @@ struct hns_roce_dev { const struct hns_roce_hw *hw; void *priv; struct workqueue_struct *irq_workq; + struct work_struct ecc_work; const struct hns_roce_dfx_hw *dfx; u32 func_num; u32 is_vf; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba3c742258ef..cbdafaac678a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -55,6 +55,42 @@ enum { CMD_RST_PRC_EBUSY, }; +enum ecc_resource_type { + ECC_RESOURCE_QPC, + ECC_RESOURCE_CQC, + ECC_RESOURCE_MPT, + ECC_RESOURCE_SRQC, + ECC_RESOURCE_GMV, + ECC_RESOURCE_QPC_TIMER, + ECC_RESOURCE_CQC_TIMER, + ECC_RESOURCE_SCCC, + ECC_RESOURCE_COUNT, +}; + +static const struct { + const char *name; + u8 read_bt0_op; + u8 write_bt0_op; +} fmea_ram_res[] = { + { "ECC_RESOURCE_QPC", + HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 }, + { "ECC_RESOURCE_CQC", + HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 }, + { "ECC_RESOURCE_MPT", + HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 }, + { "ECC_RESOURCE_SRQC", + HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 }, + /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */ + { "ECC_RESOURCE_GMV", + 0, 0 }, + { "ECC_RESOURCE_QPC_TIMER", + HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 }, + { "ECC_RESOURCE_CQC_TIMER", + HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 }, + { "ECC_RESOURCE_SCCC", + HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 }, +}; + static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, struct ib_sge *sg) { @@ -5855,12 +5891,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? aeqe : NULL; } -static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); - int aeqe_found = 0; + irqreturn_t aeqe_found = IRQ_NONE; int event_type; u32 queue_num; int sub_type; @@ -5914,7 +5950,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->event_type = event_type; eq->sub_type = sub_type; ++eq->cons_index; - aeqe_found = 1; + aeqe_found = IRQ_HANDLED; hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); @@ -5922,7 +5958,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, } update_eq_db(eq); - return aeqe_found; + + return IRQ_RETVAL(aeqe_found); } static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) @@ -5937,11 +5974,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - int ceqe_found = 0; + irqreturn_t ceqe_found = IRQ_NONE; u32 cqn; while (ceqe) { @@ -5955,21 +5992,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, hns_roce_cq_completion(hr_dev, cqn); ++eq->cons_index; - ceqe_found = 1; + ceqe_found = IRQ_HANDLED; ceqe = next_ceqe_sw_v2(eq); } update_eq_db(eq); - return ceqe_found; + return IRQ_RETVAL(ceqe_found); } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) { struct hns_roce_eq *eq = eq_ptr; struct hns_roce_dev *hr_dev = eq->hr_dev; - int int_work; + irqreturn_t int_work; if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ @@ -5981,27 +6018,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) return IRQ_RETVAL(int_work); } -static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, + u32 int_st) { - struct hns_roce_dev *hr_dev = dev_id; - struct device *dev = hr_dev->dev; - int int_work = 0; - u32 int_st; + struct pci_dev *pdev = hr_dev->pci_dev; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + const struct hnae3_ae_ops *ops = ae_dev->ops; + irqreturn_t int_work = IRQ_NONE; u32 int_en; - /* Abnormal interrupt */ - int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { - struct pci_dev *pdev = hr_dev->pci_dev; - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); - const struct hnae3_ae_ops *ops = ae_dev->ops; + dev_err(hr_dev->dev, "AEQ overflow!\n"); - dev_err(dev, "AEQ overflow!\n"); - - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, + 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); /* Set reset level for reset_event() */ if (ops->set_default_reset_request) @@ -6013,19 +6045,165 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); - int_work = 1; - } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) { - dev_err(dev, "RAS interrupt!\n"); + int_work = IRQ_HANDLED; + } else { + dev_err(hr_dev->dev, "there is no basic abn irq found.\n"); + } + + return IRQ_RETVAL(int_work); +} - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); +static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + int ret; - int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR); + ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE); + ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG); + + return 0; +} + +static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + u32 addr_upper; + u32 addr_low; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read gmv, ret = %d.\n", ret); + return ret; + } + + addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L); + addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H); + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false); + hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low); + hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) +{ + if (res_type == ECC_RESOURCE_QPC_TIMER || + res_type == ECC_RESOURCE_CQC_TIMER || + res_type == ECC_RESOURCE_SCCC) + return le64_to_cpu(*data); + + return le64_to_cpu(*data) << PAGE_SHIFT; +} - int_work = 1; +static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, + u32 index) +{ + u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op; + u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op; + struct hns_roce_cmd_mailbox *mailbox; + u64 addr; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read fmea ram, ret = %d.\n", + ret); + goto out; + } + + addr = fmea_get_ram_res_addr(res_type, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index); + if (ret) + dev_err(hr_dev->dev, + "failed to execute cmd to write fmea ram, ret = %d.\n", + ret); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + u32 res_type = ecc_info->res_type; + u32 index = ecc_info->index; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT); + + if (res_type >= ECC_RESOURCE_COUNT) { + dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n", + res_type); + return; + } + + if (res_type == ECC_RESOURCE_GMV) + ret = fmea_recover_gmv(hr_dev, index); + else + ret = fmea_recover_others(hr_dev, res_type, index); + if (ret) + dev_err(hr_dev->dev, + "failed to recover %s, index = %u, ret = %d.\n", + fmea_ram_res[res_type].name, index, ret); +} + +static void fmea_ram_ecc_work(struct work_struct *ecc_work) +{ + struct hns_roce_dev *hr_dev = + container_of(ecc_work, struct hns_roce_dev, ecc_work); + struct fmea_ram_ecc ecc_info = {}; + + if (fmea_ram_ecc_query(hr_dev, &ecc_info)) { + dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n"); + return; + } + + if (!ecc_info.is_ecc_err) { + dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n"); + return; + } + + fmea_ram_ecc_recover(hr_dev, &ecc_info); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + irqreturn_t int_work = IRQ_NONE; + u32 int_st; + + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + + if (int_st) { + int_work = abnormal_interrupt_basic(hr_dev, int_st); + } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { + queue_work(hr_dev->irq_workq, &hr_dev->ecc_work); + int_work = IRQ_HANDLED; } else { - dev_err(dev, "There is no abnormal irq found!\n"); + dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); } return IRQ_RETVAL(int_work); @@ -6342,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) } } + INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 7ffb7824d268..f96debac30fe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -250,6 +250,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_OPC_EXT_CFG = 0x8512, + HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, }; @@ -1107,6 +1108,11 @@ enum { #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32) #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64) +/* Fields of HNS_ROCE_QUERY_RAM_ECC */ +#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0) +#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32) +#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64) + struct hns_roce_cfg_sgid_tb { __le32 table_idx_rsv; __le32 vf_sgid_l; @@ -1343,6 +1349,12 @@ struct hns_roce_dip { struct list_head node; /* all dips are on a list */ }; +struct fmea_ram_ecc { + u32 is_ecc_err; + u32 res_type; + u32 index; +}; + /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 #define HNS_ROCE_MAX_CQ_PERIOD 65 @@ -1382,7 +1394,6 @@ struct hns_roce_dip { #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 -#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1 #define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 638bf4a1ed94..e4d837f82d75 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1477,12 +1477,13 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, list_for_each_entry (listen_node, &cm_core->listen_list, list) { memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); listen_port = listen_node->loc_port; + if (listen_port != dst_port || + !(listener_state & listen_node->listener_state)) + continue; /* compare node pair, return node handle if a match */ - if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) || - !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) && - listen_port == dst_port && - vlan_id == listen_node->vlan_id && - (listener_state & listen_node->listener_state)) { + if (!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) || + (!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) && + vlan_id == listen_node->vlan_id)) { refcount_inc(&listen_node->refcnt); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 58c0e181ca2b..a41e0d21143a 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -4872,10 +4872,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) sd_diff = sd_needed - hmc_fpm_misc->max_sds; if (sd_diff > 128) { - if (qpwanted > 128 && sd_diff > 144) + if (!(loop_count % 2) && qpwanted > 128) { qpwanted /= 2; - mrwanted /= 2; - pblewanted /= 2; + } else { + mrwanted /= 2; + pblewanted /= 2; + } continue; } if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF && diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index dd3943d22dc6..4f132c6fb653 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -257,10 +257,6 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); ctx_info = &iwqp->ctx_info; - if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = true; - else - ctx_info->iwarp_info->err_rq_idx_valid = true; } else { if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) continue; @@ -370,16 +366,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: case IRDMA_AE_LCE_CQ_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: - if (rdma_protocol_roce(&iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = false; - else - ctx_info->iwarp_info->err_rq_idx_valid = false; - fallthrough; default: - ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n", - info->ae_id, info->qp, info->qp_cq_id); + ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", + info->ae_id, info->qp, info->qp_cq_id, info->ae_src); if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) { + ctx_info->roce_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->roce_info->err_rq_idx = info->wqe_idx; irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, ctx_info); @@ -388,7 +380,8 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) irdma_cm_disconn(iwqp); break; } - if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) { + ctx_info->iwarp_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->iwarp_info->err_rq_idx = info->wqe_idx; ctx_info->tcp_info_valid = false; ctx_info->iwarp_info_valid = true; @@ -1512,10 +1505,7 @@ static int irdma_hmc_setup(struct irdma_pci_f *rf) int status; u32 qpcnt; - if (rf->rdma_ver == IRDMA_GEN_1) - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2; - else - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; + qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; rf->sd_type = IRDMA_SD_TYPE_DIRECT; status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt); @@ -1543,7 +1533,7 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) rf->obj_mem.pa); rf->obj_mem.va = NULL; if (rf->rdma_ver != IRDMA_GEN_1) { - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; } kfree(rf->ceqlist); @@ -1972,9 +1962,8 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) u32 ret; if (rf->rdma_ver != IRDMA_GEN_1) { - rf->allocated_ws_nodes = - kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES), - sizeof(unsigned long), GFP_KERNEL); + rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES, + GFP_KERNEL); if (!rf->allocated_ws_nodes) return -ENOMEM; @@ -2023,7 +2012,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) return 0; mem_rsrc_kzalloc_fail: - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; return ret; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index ef862bced20f..65e966ad3453 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -85,7 +85,7 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_NO_QSET 0xffff #define IW_CFG_FPM_QP_COUNT 32768 -#define IRDMA_MAX_PAGES_PER_FMR 512 +#define IRDMA_MAX_PAGES_PER_FMR 262144 #define IRDMA_MIN_PAGES_PER_FMR 1 #define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED 2 #define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED 3 diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index ab3c5208a123..fdf4cc88cb91 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -652,6 +652,7 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { + {0xffff, 0x8002, "Invalid State"}, {0xffff, 0x8006, "Flush No Wqe Pending"}, {0xffff, 0x8007, "Modify QP Bad Close"}, {0xffff, 0x8009, "LLP Closed"}, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c4412ece5a6d..d78b11a41b6b 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1776,11 +1776,11 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) spin_unlock_irqrestore(&iwcq->lock, flags); irdma_cq_wq_destroy(iwdev->rf, cq); - irdma_cq_free_rsrc(iwdev->rf, iwcq); spin_lock_irqsave(&iwceq->ce_lock, flags); irdma_sc_cleanup_ceqes(cq, ceq); spin_unlock_irqrestore(&iwceq->ce_lock, flags); + irdma_cq_free_rsrc(iwdev->rf, iwcq); return 0; } @@ -2605,7 +2605,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, palloc = &iwpbl->pble_alloc; iwmr->page_cnt = max_num_sg; err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, - true); + false); if (err_code) goto err_get_pble; @@ -2641,8 +2641,16 @@ static int irdma_set_page(struct ib_mr *ibmr, u64 addr) if (unlikely(iwmr->npages == iwmr->page_cnt)) return -ENOMEM; - pbl = palloc->level1.addr; - pbl[iwmr->npages++] = addr; + if (palloc->level == PBLE_LEVEL_2) { + struct irdma_pble_info *palloc_info = + palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT); + + palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr; + } else { + pbl = palloc->level1.addr; + pbl[iwmr->npages] = addr; + } + iwmr->npages++; return 0; } diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 08371a80fdc2..be189e0525de 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -523,6 +523,10 @@ repoll: "Requestor" : "Responder", cq->mcq.cqn); mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", err_cqe->syndrome, err_cqe->vendor_err_synd); + if (wc->status != IB_WC_WR_FLUSH_ERR && + (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) + dev->umrc.state = MLX5_UMR_STATE_RECOVER; + if (opcode == MLX5_CQE_REQ_ERR) { wq = &(*cur_qp)->sq; wqe_ctr = be16_to_cpu(cqe64->wqe_counter); diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c index 001d766cf291..3669c90b2dad 100644 --- a/drivers/infiniband/hw/mlx5/dm.c +++ b/drivers/infiniband/hw/mlx5/dm.c @@ -336,9 +336,15 @@ err_copy: static enum mlx5_sw_icm_type get_icm_type(int uapi_type) { - return uapi_type == MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM ? - MLX5_SW_ICM_TYPE_STEERING : - MLX5_SW_ICM_TYPE_HEADER_MODIFY; + switch (uapi_type) { + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + return MLX5_SW_ICM_TYPE_HEADER_MODIFY; + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + return MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN; + case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: + default: + return MLX5_SW_ICM_TYPE_STEERING; + } } static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, @@ -347,11 +353,32 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, int type) { struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev; - enum mlx5_sw_icm_type icm_type = get_icm_type(type); + enum mlx5_sw_icm_type icm_type; struct mlx5_ib_dm_icm *dm; u64 act_size; int err; + if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW)) + return ERR_PTR(-EPERM); + + switch (type) { + case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))) + return ERR_PTR(-EOPNOTSUPP); + break; + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + if (!MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) || + !MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2)) + return ERR_PTR(-EOPNOTSUPP); + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + dm = kzalloc(sizeof(*dm), GFP_KERNEL); if (!dm) return ERR_PTR(-ENOMEM); @@ -359,19 +386,6 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, dm->base.type = type; dm->base.ibdm.device = ctx->device; - if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW)) { - err = -EPERM; - goto free; - } - - if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) || - MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) || - MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) || - MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))) { - err = -EOPNOTSUPP; - goto free; - } - /* Allocation size must a multiple of the basic block size * and a power of 2. */ @@ -379,6 +393,8 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, act_size = roundup_pow_of_two(act_size); dm->base.size = act_size; + icm_type = get_icm_type(type); + err = mlx5_dm_sw_icm_alloc(dev, icm_type, act_size, attr->alignment, to_mucontext(ctx)->devx_uid, &dm->base.dev_addr, &dm->obj_id); @@ -420,8 +436,8 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, case MLX5_IB_UAPI_DM_TYPE_MEMIC: return handle_alloc_dm_memic(context, attr, attrs); case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: - return handle_alloc_dm_sw_icm(context, attr, attrs, type); case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: return handle_alloc_dm_sw_icm(context, attr, attrs, type); default: return ERR_PTR(-EOPNOTSUPP); @@ -474,6 +490,7 @@ static int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, return 0; case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: return mlx5_dm_icm_dealloc(ctx, to_icm(ibdm)); default: return -EOPNOTSUPP; diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 39ffb363ba0c..691d00c89f33 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -679,7 +679,15 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, +static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); +} + +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, + struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, int priority, int num_entries, int num_groups, @@ -688,6 +696,8 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; + if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; @@ -784,8 +794,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) - return _get_prio(ns, prio, priority, max_table_size, num_groups, - flags); + return _get_prio(dev, ns, prio, priority, max_table_size, + num_groups, flags); return prio; } @@ -927,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, prio = &dev->flow_db->opfcs[type]; if (!prio->flow_table) { - prio = _get_prio(ns, prio, priority, + prio = _get_prio(dev, ns, prio, priority, dev->num_ports * MAX_OPFC_RULES, 1, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); @@ -1407,8 +1417,8 @@ free_ucmd: } static struct mlx5_ib_flow_prio * -_get_flow_table(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_matcher *fs_matcher, +_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, + enum mlx5_flow_namespace_type ns_type, bool mcast) { struct mlx5_flow_namespace *ns = NULL; @@ -1421,11 +1431,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = ib_prio_to_core_prio(fs_matcher->priority, false); + priority = ib_prio_to_core_prio(user_priority, false); esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != DEVLINK_ESWITCH_ENCAP_MODE_NONE; - switch (fs_matcher->ns_type) { + switch (ns_type) { case MLX5_FLOW_NAMESPACE_BYPASS: max_table_size = BIT( MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); @@ -1452,17 +1462,17 @@ _get_flow_table(struct mlx5_ib_dev *dev, reformat_l3_tunnel_to_l2) && esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - priority = fs_matcher->priority; + priority = user_priority; break; case MLX5_FLOW_NAMESPACE_RDMA_RX: max_table_size = BIT( MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size)); - priority = fs_matcher->priority; + priority = user_priority; break; case MLX5_FLOW_NAMESPACE_RDMA_TX: max_table_size = BIT( MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); - priority = fs_matcher->priority; + priority = user_priority; break; default: break; @@ -1470,11 +1480,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); - ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); if (!ns) return ERR_PTR(-EOPNOTSUPP); - switch (fs_matcher->ns_type) { + switch (ns_type) { case MLX5_FLOW_NAMESPACE_BYPASS: prio = &dev->flow_db->prios[priority]; break; @@ -1499,7 +1509,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, if (prio->flow_table) return prio; - return _get_prio(ns, prio, priority, max_table_size, + return _get_prio(dev, ns, prio, priority, max_table_size, MLX5_FS_MAX_TYPES, flags); } @@ -1618,7 +1628,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( mcast = raw_fs_is_multicast(fs_matcher, cmd_in); mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, fs_matcher, mcast); + ft_prio = _get_flow_table(dev, fs_matcher->priority, + fs_matcher->ns_type, mcast); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -2015,6 +2026,23 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject, return 0; } +static int steering_anchor_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_steering_anchor *obj = uobject->object; + + if (atomic_read(&obj->usecnt)) + return -EBUSY; + + mutex_lock(&obj->dev->flow_db->lock); + put_flow_table(obj->dev, obj->ft_prio, true); + mutex_unlock(&obj->dev->flow_db->lock); + + kfree(obj); + return 0; +} + static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *obj) { @@ -2121,6 +2149,75 @@ end: return err; } +static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE); + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); + enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type; + enum mlx5_flow_namespace_type ns_type; + struct mlx5_ib_steering_anchor *obj; + struct mlx5_ib_flow_prio *ft_prio; + u16 priority; + u32 ft_id; + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + err = uverbs_get_const(&ib_uapi_ft_type, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE); + if (err) + return err; + + err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type); + if (err) + return err; + + err = uverbs_copy_from(&priority, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY); + if (err) + return err; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); + if (IS_ERR(ft_prio)) { + mutex_unlock(&dev->flow_db->lock); + err = PTR_ERR(ft_prio); + goto free_obj; + } + + ft_prio->refcount++; + ft_id = mlx5_flow_table_id(ft_prio->flow_table); + mutex_unlock(&dev->flow_db->lock); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + &ft_id, sizeof(ft_id)); + if (err) + goto put_flow_table; + + uobj->object = obj; + obj->dev = dev; + obj->ft_prio = ft_prio; + atomic_set(&obj->usecnt, 0); + + return 0; + +put_flow_table: + mutex_lock(&dev->flow_db->lock); + put_flow_table(dev, ft_prio, true); + mutex_unlock(&dev->flow_db->lock); +free_obj: + kfree(obj); + + return err; +} + static struct ib_flow_action * mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, enum mlx5_ib_uapi_flow_table_type ft_type, @@ -2477,6 +2574,35 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER, &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); + const struct uapi_definition mlx5_ib_flow_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_FLOW_MATCHER), @@ -2485,6 +2611,9 @@ const struct uapi_definition mlx5_ib_flow_defs[] = { &mlx5_ib_fs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_actions), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), {}, }; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b68fddeac0f1..a174a0eee8dc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4002,7 +4002,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { int err; - err = mlx5_mr_cache_cleanup(dev); + err = mlx5_mkey_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); @@ -4022,7 +4022,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) if (ret) return ret; - ret = mlx5_mr_cache_init(dev); + ret = mlx5_mkey_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); mlx5r_umr_resource_cleanup(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 998b67509a53..2e2ad3918385 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -259,6 +259,12 @@ struct mlx5_ib_flow_matcher { u8 match_criteria_enable; }; +struct mlx5_ib_steering_anchor { + struct mlx5_ib_flow_prio *ft_prio; + struct mlx5_ib_dev *dev; + atomic_t usecnt; +}; + struct mlx5_ib_pp { u16 index; struct mlx5_core_dev *mdev; @@ -613,6 +619,7 @@ struct mlx5_ib_mkey { unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; + struct mlx5_cache_ent *cache_ent; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) @@ -635,20 +642,9 @@ struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; - /* User MR data */ - struct mlx5_cache_ent *cache_ent; - /* Everything after cache_ent is zero'd when MR allocated */ struct ib_umem *umem; union { - /* Used only while the MR is in the cache */ - struct { - u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; - struct mlx5_async_work cb_work; - /* Cache list element */ - struct list_head list; - }; - /* Used only by kernel MRs (umem == NULL) */ struct { void *descs; @@ -688,12 +684,6 @@ struct mlx5_ib_mr { }; }; -/* Zero the fields in the mr that are variant depending on usage */ -static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr) -{ - memset_after(mr, 0, cache_ent); -} - static inline bool is_odp_mr(struct mlx5_ib_mr *mr) { return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && @@ -717,21 +707,29 @@ struct mlx5_ib_umr_context { struct completion done; }; +enum { + MLX5_UMR_STATE_ACTIVE, + MLX5_UMR_STATE_RECOVER, + MLX5_UMR_STATE_ERR, +}; + struct umr_common { struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - /* control access to UMR QP + /* Protects from UMR QP overflow */ struct semaphore sem; + /* Protects from using UMR while the UMR is not active + */ + struct mutex lock; + unsigned int state; }; struct mlx5_cache_ent { - struct list_head head; - /* sync access to the cahce entry - */ - spinlock_t lock; - + struct xarray mkeys; + unsigned long stored; + unsigned long reserved; char name[4]; u32 order; @@ -743,18 +741,11 @@ struct mlx5_cache_ent { u8 fill_to_high_water:1; /* - * - available_mrs is the length of list head, ie the number of MRs - * available for immediate allocation. - * - total_mrs is available_mrs plus all in use MRs that could be - * returned to the cache. - * - limit is the low water mark for available_mrs, 2* limit is the + * - limit is the low water mark for stored mkeys, 2* limit is the * upper water mark. - * - pending is the number of MRs currently being created */ - u32 total_mrs; - u32 available_mrs; + u32 in_use; u32 limit; - u32 pending; /* Statistics */ u32 miss; @@ -763,9 +754,19 @@ struct mlx5_cache_ent { struct delayed_work dwork; }; -struct mlx5_mr_cache { +struct mlx5r_async_create_mkey { + union { + u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; + }; + struct mlx5_async_work cb_work; + struct mlx5_cache_ent *ent; + u32 mkey; +}; + +struct mlx5_mkey_cache { struct workqueue_struct *wq; - struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES]; struct dentry *root; unsigned long last_add; }; @@ -1064,7 +1065,7 @@ struct mlx5_ib_dev { struct mlx5_ib_resources devr; atomic_t mkey_var; - struct mlx5_mr_cache cache; + struct mlx5_mkey_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ struct mutex slow_path_mutex; @@ -1309,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); -int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); -int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent, @@ -1338,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq); void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags); @@ -1357,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} -static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {} static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) {} diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1e7653c997b5..129d531bd01b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -82,15 +82,14 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, MLX5_SET64(mkc, mkc, start_addr, start_addr); } -static void assign_mkey_variant(struct mlx5_ib_dev *dev, - struct mlx5_ib_mkey *mkey, u32 *in) +static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) { u8 key = atomic_inc_return(&dev->mkey_var); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, mkey_7_0, key); - mkey->key = key; + *mkey = key; } static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, @@ -98,7 +97,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, { int ret; - assign_mkey_variant(dev, mkey, in); + assign_mkey_variant(dev, &mkey->key, in); ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); if (!ret) init_waitqueue_head(&mkey->wait); @@ -106,20 +105,21 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, return ret; } -static int -mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, - struct mlx5_ib_mkey *mkey, - struct mlx5_async_ctx *async_ctx, - u32 *in, int inlen, u32 *out, int outlen, - struct mlx5_async_work *context) +static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) { - MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); - assign_mkey_variant(dev, mkey, in); - return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, - create_mkey_callback, context); + struct mlx5_ib_dev *dev = async_create->ent->dev; + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); + + MLX5_SET(create_mkey_in, async_create->in, opcode, + MLX5_CMD_OP_CREATE_MKEY); + assign_mkey_variant(dev, &async_create->mkey, async_create->in); + return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, + async_create->out, outlen, create_mkey_callback, + &async_create->cb_work); } -static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static int mkey_cache_max_order(struct mlx5_ib_dev *dev); static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) @@ -142,40 +142,132 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); } + +static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, + void *to_store) +{ + XA_STATE(xas, &ent->mkeys, 0); + void *curr; + + xa_lock_irq(&ent->mkeys); + if (limit_pendings && + (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { + xa_unlock_irq(&ent->mkeys); + return -EAGAIN; + } + while (1) { + /* + * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version + * doesn't transparently unlock. Instead we set the xas index to + * the current value of reserved every iteration. + */ + xas_set(&xas, ent->reserved); + curr = xas_load(&xas); + if (!curr) { + if (to_store && ent->stored == ent->reserved) + xas_store(&xas, to_store); + else + xas_store(&xas, XA_ZERO_ENTRY); + if (xas_valid(&xas)) { + ent->reserved++; + if (to_store) { + if (ent->stored != ent->reserved) + __xa_store(&ent->mkeys, + ent->stored, + to_store, + GFP_KERNEL); + ent->stored++; + queue_adjust_cache_locked(ent); + WRITE_ONCE(ent->dev->cache.last_add, + jiffies); + } + } + } + xa_unlock_irq(&ent->mkeys); + + /* + * Notice xas_nomem() must always be called as it cleans + * up any cached allocation. + */ + if (!xas_nomem(&xas, GFP_KERNEL)) + break; + xa_lock_irq(&ent->mkeys); + } + if (xas_error(&xas)) + return xas_error(&xas); + if (WARN_ON(curr)) + return -EINVAL; + return 0; +} + +static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) +{ + void *old; + + ent->reserved--; + old = __xa_erase(&ent->mkeys, ent->reserved); + WARN_ON(old); +} + +static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) +{ + void *old; + + old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); + WARN_ON(old); + ent->stored++; +} + +static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) +{ + void *old, *xa_mkey; + + ent->stored--; + ent->reserved--; + + if (ent->stored == ent->reserved) { + xa_mkey = __xa_erase(&ent->mkeys, ent->stored); + WARN_ON(!xa_mkey); + return (u32)xa_to_value(xa_mkey); + } + + xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, + GFP_KERNEL); + WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); + old = __xa_erase(&ent->mkeys, ent->reserved); + WARN_ON(old); + return (u32)xa_to_value(xa_mkey); +} + static void create_mkey_callback(int status, struct mlx5_async_work *context) { - struct mlx5_ib_mr *mr = - container_of(context, struct mlx5_ib_mr, cb_work); - struct mlx5_cache_ent *ent = mr->cache_ent; + struct mlx5r_async_create_mkey *mkey_out = + container_of(context, struct mlx5r_async_create_mkey, cb_work); + struct mlx5_cache_ent *ent = mkey_out->ent; struct mlx5_ib_dev *dev = ent->dev; unsigned long flags; if (status) { - create_mkey_warn(dev, status, mr->out); - kfree(mr); - spin_lock_irqsave(&ent->lock, flags); - ent->pending--; + create_mkey_warn(dev, status, mkey_out->out); + kfree(mkey_out); + xa_lock_irqsave(&ent->mkeys, flags); + undo_push_reserve_mkey(ent); WRITE_ONCE(dev->fill_delay, 1); - spin_unlock_irqrestore(&ent->lock, flags); + xa_unlock_irqrestore(&ent->mkeys, flags); mod_timer(&dev->delay_timer, jiffies + HZ); return; } - mr->mmkey.type = MLX5_MKEY_MR; - mr->mmkey.key |= mlx5_idx_to_mkey( - MLX5_GET(create_mkey_out, mr->out, mkey_index)); - init_waitqueue_head(&mr->mmkey.wait); - + mkey_out->mkey |= mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); WRITE_ONCE(dev->cache.last_add, jiffies); - spin_lock_irqsave(&ent->lock, flags); - list_add_tail(&mr->list, &ent->head); - ent->available_mrs++; - ent->total_mrs++; + xa_lock_irqsave(&ent->mkeys, flags); + push_to_reserved(ent, mkey_out->mkey); /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; - spin_unlock_irqrestore(&ent->lock, flags); + xa_unlock_irqrestore(&ent->mkeys, flags); + kfree(mkey_out); } static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) @@ -197,15 +289,8 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) return ret; } -static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) +static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) { - struct mlx5_ib_mr *mr; - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return NULL; - mr->cache_ent = ent; - set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); @@ -215,133 +300,106 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->access_mode, ent->ndescs)); MLX5_SET(mkc, mkc, log_page_size, ent->page); - return mr; } /* Asynchronously schedule new MRs to be populated in the cache. */ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) { - size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_ib_mr *mr; + struct mlx5r_async_create_mkey *async_create; void *mkc; - u32 *in; int err = 0; int i; - in = kzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { - mr = alloc_cache_mr(ent, mkc); - if (!mr) { - err = -ENOMEM; - break; - } - spin_lock_irq(&ent->lock); - if (ent->pending >= MAX_PENDING_REG_MR) { - err = -EAGAIN; - spin_unlock_irq(&ent->lock); - kfree(mr); - break; - } - ent->pending++; - spin_unlock_irq(&ent->lock); - err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, - &ent->dev->async_ctx, in, inlen, - mr->out, sizeof(mr->out), - &mr->cb_work); + async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), + GFP_KERNEL); + if (!async_create) + return -ENOMEM; + mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, + memory_key_mkey_entry); + set_cache_mkc(ent, mkc); + async_create->ent = ent; + + err = push_mkey(ent, true, NULL); + if (err) + goto free_async_create; + + err = mlx5_ib_create_mkey_cb(async_create); if (err) { - spin_lock_irq(&ent->lock); - ent->pending--; - spin_unlock_irq(&ent->lock); mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); - kfree(mr); - break; + goto err_undo_reserve; } } - kfree(in); + return 0; + +err_undo_reserve: + xa_lock_irq(&ent->mkeys); + undo_push_reserve_mkey(ent); + xa_unlock_irq(&ent->mkeys); +free_async_create: + kfree(async_create); return err; } /* Synchronously create a MR in the cache */ -static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) +static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) { size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_ib_mr *mr; void *mkc; u32 *in; int err; in = kzalloc(inlen, GFP_KERNEL); if (!in) - return ERR_PTR(-ENOMEM); + return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + set_cache_mkc(ent, mkc); - mr = alloc_cache_mr(ent, mkc); - if (!mr) { - err = -ENOMEM; - goto free_in; - } - - err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); + err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); if (err) - goto free_mr; + goto free_in; - init_waitqueue_head(&mr->mmkey.wait); - mr->mmkey.type = MLX5_MKEY_MR; WRITE_ONCE(ent->dev->cache.last_add, jiffies); - spin_lock_irq(&ent->lock); - ent->total_mrs++; - spin_unlock_irq(&ent->lock); - kfree(in); - return mr; -free_mr: - kfree(mr); free_in: kfree(in); - return ERR_PTR(err); + return err; } static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) { - struct mlx5_ib_mr *mr; + u32 mkey; - lockdep_assert_held(&ent->lock); - if (list_empty(&ent->head)) + lockdep_assert_held(&ent->mkeys.xa_lock); + if (!ent->stored) return; - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); - ent->available_mrs--; - ent->total_mrs--; - spin_unlock_irq(&ent->lock); - mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); - kfree(mr); - spin_lock_irq(&ent->lock); + mkey = pop_stored_mkey(ent); + xa_unlock_irq(&ent->mkeys); + mlx5_core_destroy_mkey(ent->dev->mdev, mkey); + xa_lock_irq(&ent->mkeys); } static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, bool limit_fill) + __acquires(&ent->mkeys) __releases(&ent->mkeys) { int err; - lockdep_assert_held(&ent->lock); + lockdep_assert_held(&ent->mkeys.xa_lock); while (true) { if (limit_fill) target = ent->limit * 2; - if (target == ent->available_mrs + ent->pending) + if (target == ent->reserved) return 0; - if (target > ent->available_mrs + ent->pending) { - u32 todo = target - (ent->available_mrs + ent->pending); + if (target > ent->reserved) { + u32 todo = target - ent->reserved; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); err = add_keys(ent, todo); if (err == -EAGAIN) usleep_range(3000, 5000); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (err) { if (err != -EAGAIN) return err; @@ -366,15 +424,15 @@ static ssize_t size_write(struct file *filp, const char __user *buf, /* * Target is the new value of total_mrs the user requests, however we - * cannot free MRs that are in use. Compute the target value for - * available_mrs. + * cannot free MRs that are in use. Compute the target value for stored + * mkeys. */ - spin_lock_irq(&ent->lock); - if (target < ent->total_mrs - ent->available_mrs) { + xa_lock_irq(&ent->mkeys); + if (target < ent->in_use) { err = -EINVAL; goto err_unlock; } - target = target - (ent->total_mrs - ent->available_mrs); + target = target - ent->in_use; if (target < ent->limit || target > ent->limit*2) { err = -EINVAL; goto err_unlock; @@ -382,12 +440,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf, err = resize_available_mrs(ent, target, false); if (err) goto err_unlock; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); return count; err_unlock: - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); return err; } @@ -398,7 +456,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count, char lbuf[20]; int err; - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); + err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); if (err < 0) return err; @@ -427,10 +485,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf, * Upon set we immediately fill the cache to high water mark implied by * the limit. */ - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); ent->limit = var; err = resize_available_mrs(ent, 0, true); - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); if (err) return err; return count; @@ -457,17 +515,17 @@ static const struct file_operations limit_fops = { .read = limit_read, }; -static bool someone_adding(struct mlx5_mr_cache *cache) +static bool someone_adding(struct mlx5_mkey_cache *cache) { unsigned int i; - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { struct mlx5_cache_ent *ent = &cache->ent[i]; bool ret; - spin_lock_irq(&ent->lock); - ret = ent->available_mrs < ent->limit; - spin_unlock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); + ret = ent->stored < ent->limit; + xa_unlock_irq(&ent->mkeys); if (ret) return true; } @@ -481,26 +539,26 @@ static bool someone_adding(struct mlx5_mr_cache *cache) */ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) { - lockdep_assert_held(&ent->lock); + lockdep_assert_held(&ent->mkeys.xa_lock); if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) return; - if (ent->available_mrs < ent->limit) { + if (ent->stored < ent->limit) { ent->fill_to_high_water = true; mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } else if (ent->fill_to_high_water && - ent->available_mrs + ent->pending < 2 * ent->limit) { + ent->reserved < 2 * ent->limit) { /* * Once we start populating due to hitting a low water mark * continue until we pass the high water mark. */ mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); - } else if (ent->available_mrs == 2 * ent->limit) { + } else if (ent->stored == 2 * ent->limit) { ent->fill_to_high_water = false; - } else if (ent->available_mrs > 2 * ent->limit) { + } else if (ent->stored > 2 * ent->limit) { /* Queue deletion of excess entries */ ent->fill_to_high_water = false; - if (ent->pending) + if (ent->stored != ent->reserved) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, msecs_to_jiffies(1000)); else @@ -511,25 +569,24 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; int err; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; - if (ent->fill_to_high_water && - ent->available_mrs + ent->pending < 2 * ent->limit && + if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && !READ_ONCE(dev->fill_delay)) { - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); err = add_keys(ent, 1); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; if (err) { /* - * EAGAIN only happens if pending is positive, so we - * will be rescheduled from reg_mr_callback(). The only + * EAGAIN only happens if there are pending MRs, so we + * will be rescheduled when storing them. The only * failure path here is ENOMEM. */ if (err != -EAGAIN) { @@ -541,7 +598,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) msecs_to_jiffies(1000)); } } - } else if (ent->available_mrs > 2 * ent->limit) { + } else if (ent->stored > 2 * ent->limit) { bool need_delay; /* @@ -556,11 +613,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) * the garbage collection work to try to run in next cycle, in * order to free CPU resources to other tasks. */ - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); need_delay = need_resched() || someone_adding(cache) || !time_after(jiffies, READ_ONCE(cache->last_add) + 300 * HZ); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; if (need_delay) { @@ -571,7 +628,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) queue_adjust_cache_locked(ent); } out: - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); } static void delayed_cache_work_func(struct work_struct *work) @@ -587,73 +644,59 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int access_flags) { struct mlx5_ib_mr *mr; + int err; - /* Matches access in alloc_cache_mr() */ if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) return ERR_PTR(-EOPNOTSUPP); - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + xa_lock_irq(&ent->mkeys); + ent->in_use++; + + if (!ent->stored) { queue_adjust_cache_locked(ent); ent->miss++; - spin_unlock_irq(&ent->lock); - mr = create_cache_mr(ent); - if (IS_ERR(mr)) - return mr; + xa_unlock_irq(&ent->mkeys); + err = create_cache_mkey(ent, &mr->mmkey.key); + if (err) { + xa_lock_irq(&ent->mkeys); + ent->in_use--; + xa_unlock_irq(&ent->mkeys); + kfree(mr); + return ERR_PTR(err); + } } else { - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); - ent->available_mrs--; + mr->mmkey.key = pop_stored_mkey(ent); queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); - - mlx5_clear_mr(mr); + xa_unlock_irq(&ent->mkeys); } + mr->mmkey.cache_ent = ent; + mr->mmkey.type = MLX5_MKEY_MR; + init_waitqueue_head(&mr->mmkey.wait); return mr; } -static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) -{ - struct mlx5_cache_ent *ent = mr->cache_ent; - - WRITE_ONCE(dev->cache.last_add, jiffies); - spin_lock_irq(&ent->lock); - list_add_tail(&mr->list, &ent->head); - ent->available_mrs++; - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); -} - static void clean_keys(struct mlx5_ib_dev *dev, int c) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_ib_mr *tmp_mr; - struct mlx5_ib_mr *mr; - LIST_HEAD(del_list); + u32 mkey; cancel_delayed_work(&ent->dwork); - while (1) { - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { - spin_unlock_irq(&ent->lock); - break; - } - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_move(&mr->list, &del_list); - ent->available_mrs--; - ent->total_mrs--; - spin_unlock_irq(&ent->lock); - mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); - } - - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { - list_del(&mr->list); - kfree(mr); + xa_lock_irq(&ent->mkeys); + while (ent->stored) { + mkey = pop_stored_mkey(ent); + xa_unlock_irq(&ent->mkeys); + mlx5_core_destroy_mkey(dev->mdev, mkey); + xa_lock_irq(&ent->mkeys); } + xa_unlock_irq(&ent->mkeys); } -static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) return; @@ -662,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) dev->cache.root = NULL; } -static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; struct dentry *dir; int i; @@ -674,13 +717,13 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; sprintf(ent->name, "%d", ent->order); dir = debugfs_create_dir(ent->name, cache->root); debugfs_create_file("size", 0600, dir, ent, &size_fops); debugfs_create_file("limit", 0600, dir, ent, &limit_fops); - debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); + debugfs_create_ulong("cur", 0400, dir, &ent->stored); debugfs_create_u32("miss", 0600, dir, &ent->miss); } } @@ -692,9 +735,9 @@ static void delay_time_func(struct timer_list *t) WRITE_ONCE(dev->fill_delay, 0); } -int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int i; @@ -707,22 +750,21 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); timer_setup(&dev->delay_timer, delay_time_func, 0); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; - INIT_LIST_HEAD(&ent->head); - spin_lock_init(&ent->lock); + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); ent->order = i + 2; ent->dev = dev; ent->limit = 0; INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - if (i > MR_CACHE_LAST_STD_ENTRY) { - mlx5_odp_init_mr_cache_entry(ent); + if (i > MKEY_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mkey_cache_entry(ent); continue; } - if (ent->order > mr_cache_max_order(dev)) + if (ent->order > mkey_cache_max_order(dev)) continue; ent->page = PAGE_SHIFT; @@ -734,36 +776,36 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->limit = dev->mdev->profile.mr_cache[i].limit; else ent->limit = 0; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); } - mlx5_mr_cache_debugfs_init(dev); + mlx5_mkey_cache_debugfs_init(dev); return 0; } -int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) { unsigned int i; if (!dev->cache.wq) return 0; - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { struct mlx5_cache_ent *ent = &dev->cache.ent[i]; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); ent->disabled = true; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); cancel_delayed_work_sync(&ent->dwork); } - mlx5_mr_cache_debugfs_cleanup(dev); + mlx5_mkey_cache_debugfs_cleanup(dev); mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); @@ -830,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift) return (npages + 1) / 2; } -static int mr_cache_max_order(struct mlx5_ib_dev *dev) +static int mkey_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MR_CACHE_LAST_STD_ENTRY + 2; + return MKEY_CACHE_LAST_STD_ENTRY + 2; return MLX5_MAX_UMR_SHIFT; } -static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, - unsigned int order) +static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, + unsigned int order) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; if (order < cache->ent[0].order) return &cache->ent[0]; order = order - cache->ent[0].order; - if (order > MR_CACHE_LAST_STD_ENTRY) + if (order > MKEY_CACHE_LAST_STD_ENTRY) return NULL; return &cache->ent[order]; } @@ -888,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 0, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); - ent = mr_cache_ent_from_order( + ent = mkey_cache_ent_from_order( dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); /* * Matches access in alloc_cache_mr(). If the MR can't come from the @@ -1083,6 +1125,7 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, break; case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) return ERR_PTR(-EINVAL); @@ -1319,7 +1362,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); /* We only track the allocated sizes of MRs from the cache */ - if (!mr->cache_ent) + if (!mr->mmkey.cache_ent) return false; if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; @@ -1328,7 +1371,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); if (WARN_ON(!*page_size)) return false; - return (1ULL << mr->cache_ent->order) >= + return (1ULL << mr->mmkey.cache_ent->order) >= ib_umem_num_dma_blocks(new_umem, *page_size); } @@ -1569,15 +1612,17 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->cache_ent) { - if (mlx5r_umr_revoke_mr(mr)) { - spin_lock_irq(&mr->cache_ent->lock); - mr->cache_ent->total_mrs--; - spin_unlock_irq(&mr->cache_ent->lock); - mr->cache_ent = NULL; - } + if (mr->mmkey.cache_ent) { + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + mr->mmkey.cache_ent->in_use--; + xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); + + if (mlx5r_umr_revoke_mr(mr) || + push_mkey(mr->mmkey.cache_ent, false, + xa_mk_value(mr->mmkey.key))) + mr->mmkey.cache_ent = NULL; } - if (!mr->cache_ent) { + if (!mr->mmkey.cache_ent) { rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); if (rc) return rc; @@ -1594,12 +1639,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) mlx5_ib_free_odp_mr(mr); } - if (mr->cache_ent) { - mlx5_mr_cache_free(dev, mr); - } else { + if (!mr->mmkey.cache_ent) mlx5_free_priv_descs(mr); - kfree(mr); - } + + kfree(mr); return 0; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 84da5674e1ab..e305bf1dc6c2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) return err; } -void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) { if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return; diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 3a48364c0918..e00b94d1b1ea 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); + mutex_init(&dev->umrc.lock); return 0; @@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(dev->umrc.pd); } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) + goto err; + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) { @@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, id.ib_cqe = cqe; mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, - MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); + MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); mlx5r_ring_db(qp, 1, ctrl); @@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5r_umr_init_context(&umr_context); down(&umrc->sem); - err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, - with_data); - if (err) - mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); - else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); + while (true) { + mutex_lock(&umrc->lock); + if (umrc->state == MLX5_UMR_STATE_ERR) { + mutex_unlock(&umrc->lock); err = -EFAULT; + break; + } + + if (umrc->state == MLX5_UMR_STATE_RECOVER) { + mutex_unlock(&umrc->lock); + usleep_range(3000, 5000); + continue; + } + + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, + with_data); + mutex_unlock(&umrc->lock); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", + err); + break; } + + wait_for_completion(&umr_context.done); + + if (umr_context.status == IB_WC_SUCCESS) + break; + + if (umr_context.status == IB_WC_WR_FLUSH_ERR) + continue; + + WARN_ON_ONCE(1); + mlx5_ib_warn(dev, + "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n", + umr_context.status); + mutex_lock(&umrc->lock); + err = mlx5r_umr_recover(dev); + mutex_unlock(&umrc->lock); + if (err) + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", + err); + err = -EFAULT; + break; } up(&umrc->sem); return err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f0f43b6db89e..3ecb472c9217 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3082,7 +3082,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, else DP_ERR(dev, "roce alloc tid returned error %d\n", rc); - goto err0; + goto err1; } /* Index only, 18 bit long, lkey = itid << 8 | key */ @@ -3106,7 +3106,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { DP_ERR(dev, "roce register tid returned an error %d\n", rc); - goto err1; + goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; @@ -3115,8 +3115,10 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey); return mr; -err1: +err2: dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err1: + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); err0: kfree(mr); return ERR_PTR(rc); diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index b37b1c6d35c6..26c615772be3 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -321,7 +321,7 @@ struct qib_verbs_txreq { * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed * negotiation) are used for the 3rd argument to path_f_set_ib_cfg * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They - * are also the the possible values for qib_link_speed_enabled and active + * are also the possible values for qib_link_speed_enabled and active * The values were chosen to match values used within the IB spec. */ #define QIB_IB_SDR 1 diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index aa290928cf96..3937144b2ae5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -153,7 +153,7 @@ static int qib_get_base_info(struct file *fp, void __user *ubase, kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; /* * for this use, may be cfgctxts summed over all chips that - * are are configured and present + * are configured and present */ kinfo->spi_nctxts = dd->cfgctxts; /* unit (chip/board) our context is on */ @@ -851,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, ret = -EPERM; goto bail; } - /* don't allow them to later change to writeable with mprotect */ + /* don't allow them to later change to writable with mprotect */ vma->vm_flags &= ~VM_MAYWRITE; start = vma->vm_start; @@ -941,7 +941,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, goto bail; } /* - * Don't allow permission to later change to writeable + * Don't allow permission to later change to writable * with mprotect. */ vma->vm_flags &= ~VM_MAYWRITE; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index ceed302cf6a0..6861c6384f18 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2850,9 +2850,9 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) qib_7322_free_irq(dd); kfree(dd->cspec->cntrs); - kfree(dd->cspec->sendchkenable); - kfree(dd->cspec->sendgrhchk); - kfree(dd->cspec->sendibchk); + bitmap_free(dd->cspec->sendchkenable); + bitmap_free(dd->cspec->sendgrhchk); + bitmap_free(dd->cspec->sendibchk); kfree(dd->cspec->msix_entries); for (i = 0; i < dd->num_pports; i++) { unsigned long flags; @@ -6383,18 +6383,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd) features = qib_7322_boardname(dd); /* now that piobcnt2k and 4k set, we can allocate these */ - sbufcnt = dd->piobcnt2k + dd->piobcnt4k + - NUM_VL15_BUFS + BITS_PER_LONG - 1; - sbufcnt /= BITS_PER_LONG; - dd->cspec->sendchkenable = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable), - GFP_KERNEL); - dd->cspec->sendgrhchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk), - GFP_KERNEL); - dd->cspec->sendibchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk), - GFP_KERNEL); + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + + dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || !dd->cspec->sendibchk) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index d1a72e89e297..45211008449f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1106,8 +1106,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) if (!qib_cpulist_count) { u32 count = num_online_cpus(); - qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), - GFP_KERNEL); + qib_cpulist = bitmap_zalloc(count, GFP_KERNEL); if (qib_cpulist) qib_cpulist_count = count; } @@ -1279,7 +1278,7 @@ static void __exit qib_ib_cleanup(void) #endif qib_cpulist_count = 0; - kfree(qib_cpulist); + bitmap_free(qib_cpulist); WARN_ON(!xa_empty(&qib_dev_table)); qib_dev_cleanup(); diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 81b810d006c0..1dc3ccf0cf1f 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -587,7 +587,7 @@ static int epb_access(struct qib_devdata *dd, int sdnum, int claim) /* Need to release */ u64 pollval; /* - * The only writeable bits are the request and CS. + * The only writable bits are the request and CS. * Both should be clear */ u64 newval = 0; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index e212929369df..67a1b4562dc2 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -482,7 +482,7 @@ int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) if (err) goto out_free_dev; - if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { usnic_err("IOMMU of %s does not support cache coherency\n", dev_name(dev)); err = -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index da3a398053b8..bc53cad077aa 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -114,6 +114,8 @@ void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index); + if (qp->valid) { qp->comp.timeout = 1; rxe_run_task(&qp->comp.task, 1); @@ -560,7 +562,7 @@ int rxe_completer(void *arg) struct sk_buff *skb = NULL; struct rxe_pkt_info *pkt = NULL; enum comp_state state; - int ret = 0; + int ret; if (!rxe_get(qp)) return -EAGAIN; @@ -569,8 +571,7 @@ int rxe_completer(void *arg) qp->req.state == QP_STATE_RESET) { rxe_drain_resp_pkts(qp, qp->valid && qp->req.state == QP_STATE_ERROR); - ret = -EAGAIN; - goto done; + goto exit; } if (qp->comp.timeout) { @@ -580,10 +581,8 @@ int rxe_completer(void *arg) qp->comp.timeout_retry = 0; } - if (qp->req.need_retry) { - ret = -EAGAIN; - goto done; - } + if (qp->req.need_retry) + goto exit; state = COMPST_GET_ACK; @@ -676,8 +675,7 @@ int rxe_completer(void *arg) qp->qp_timeout_jiffies) mod_timer(&qp->retrans_timer, jiffies + qp->qp_timeout_jiffies); - ret = -EAGAIN; - goto done; + goto exit; case COMPST_ERROR_RETRY: /* we come here if the retry timer fired and we did @@ -689,10 +687,8 @@ int rxe_completer(void *arg) */ /* there is nothing to retry in this case */ - if (!wqe || (wqe->state == wqe_state_posted)) { - ret = -EAGAIN; - goto done; - } + if (!wqe || (wqe->state == wqe_state_posted)) + goto exit; /* if we've started a retry, don't start another * retry sequence, unless this is a timeout. @@ -730,18 +726,21 @@ int rxe_completer(void *arg) break; case COMPST_RNR_RETRY: + /* we come here if we received an RNR NAK */ if (qp->comp.rnr_retry > 0) { if (qp->comp.rnr_retry != 7) qp->comp.rnr_retry--; - qp->req.need_retry = 1; + /* don't start a retry flow until the + * rnr timer has fired + */ + qp->req.wait_for_rnr_timer = 1; pr_debug("qp#%d set rnr nak timer\n", qp_num(qp)); mod_timer(&qp->rnr_nak_timer, jiffies + rnrnak_jiffies(aeth_syn(pkt) & ~AETH_TYPE_MASK)); - ret = -EAGAIN; - goto done; + goto exit; } else { rxe_counter_inc(rxe, RXE_CNT_RNR_RETRY_EXCEEDED); @@ -754,12 +753,20 @@ int rxe_completer(void *arg) WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS); do_complete(qp, wqe); rxe_qp_error(qp); - ret = -EAGAIN; - goto done; + goto exit; } } + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the tasklet. A zero return + * will continue looping and return to rxe_completer + */ done: + ret = 0; + goto out; +exit: + ret = -EAGAIN; +out: if (pkt) free_pkt(pkt); rxe_put(qp); diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 642b52539ac3..b1a0ab3cd4bd 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -19,16 +19,16 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, } if (cqe > rxe->attr.max_cqe) { - pr_warn("cqe(%d) > max_cqe(%d)\n", - cqe, rxe->attr.max_cqe); + pr_debug("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); goto err1; } if (cq) { count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); if (cqe < count) { - pr_warn("cqe(%d) < current # elements in queue (%d)", - cqe, count); + pr_debug("cqe(%d) < current # elements in queue (%d)", + cqe, count); goto err1; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 0e022ae1b8a5..22f6cc31d1d6 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -77,9 +77,8 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); -int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey); +int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); -int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr); int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); void rxe_mr_cleanup(struct rxe_pool_elem *elem); @@ -145,7 +144,7 @@ static inline int rcv_wqe_size(int max_sge) max_sge * sizeof(struct ib_sge); } -void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); +void free_rd_atomic_resource(struct resp_res *res); static inline void rxe_advance_resp_resource(struct rxe_qp *qp) { diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index fc3942e04a1f..850b80f5ad8b 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -24,7 +24,7 @@ u8 rxe_get_next_key(u32 last_key) int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { - struct rxe_map_set *set = mr->cur_map_set; + switch (mr->type) { case IB_MR_TYPE_DMA: @@ -32,8 +32,8 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) case IB_MR_TYPE_USER: case IB_MR_TYPE_MEM_REG: - if (iova < set->iova || length > set->length || - iova > set->iova + set->length - length) + if (iova < mr->iova || length > mr->length || + iova > mr->iova + mr->length - length) return -EFAULT; return 0; @@ -65,89 +65,41 @@ static void rxe_mr_init(int access, struct rxe_mr *mr) mr->map_shift = ilog2(RXE_BUF_PER_MAP); } -static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set) -{ - int i; - - for (i = 0; i < num_map; i++) - kfree(set->map[i]); - - kfree(set->map); - kfree(set); -} - -static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp) +static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) { int i; - struct rxe_map_set *set; + int num_map; + struct rxe_map **map = mr->map; - set = kmalloc(sizeof(*set), GFP_KERNEL); - if (!set) - goto err_out; + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; - set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL); - if (!set->map) - goto err_free_set; + mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); + if (!mr->map) + goto err1; for (i = 0; i < num_map; i++) { - set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL); - if (!set->map[i]) - goto err_free_map; + mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); + if (!mr->map[i]) + goto err2; } - *setp = set; - - return 0; - -err_free_map: - for (i--; i >= 0; i--) - kfree(set->map[i]); - - kfree(set->map); -err_free_set: - kfree(set); -err_out: - return -ENOMEM; -} - -/** - * rxe_mr_alloc() - Allocate memory map array(s) for MR - * @mr: Memory region - * @num_buf: Number of buffer descriptors to support - * @both: If non zero allocate both mr->map and mr->next_map - * else just allocate mr->map. Used for fast MRs - * - * Return: 0 on success else an error - */ -static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both) -{ - int ret; - int num_map; - BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); - num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; mr->map_shift = ilog2(RXE_BUF_PER_MAP); mr->map_mask = RXE_BUF_PER_MAP - 1; + mr->num_buf = num_buf; - mr->max_buf = num_map * RXE_BUF_PER_MAP; mr->num_map = num_map; - - ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set); - if (ret) - return -ENOMEM; - - if (both) { - ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set); - if (ret) - goto err_free; - } + mr->max_buf = num_map * RXE_BUF_PER_MAP; return 0; -err_free: - rxe_mr_free_map_set(mr->num_map, mr->cur_map_set); - mr->cur_map_set = NULL; +err2: + for (i--; i >= 0; i--) + kfree(mr->map[i]); + + kfree(mr->map); +err1: return -ENOMEM; } @@ -164,7 +116,6 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr) int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr) { - struct rxe_map_set *set; struct rxe_map **map; struct rxe_phys_buf *buf = NULL; struct ib_umem *umem; @@ -172,6 +123,7 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int num_buf; void *vaddr; int err; + int i; umem = ib_umem_get(pd->ibpd.device, start, length, access); if (IS_ERR(umem)) { @@ -185,20 +137,18 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, rxe_mr_init(access, mr); - err = rxe_mr_alloc(mr, num_buf, 0); + err = rxe_mr_alloc(mr, num_buf); if (err) { pr_warn("%s: Unable to allocate memory for map\n", __func__); goto err_release_umem; } - set = mr->cur_map_set; - set->page_shift = PAGE_SHIFT; - set->page_mask = PAGE_SIZE - 1; - - num_buf = 0; - map = set->map; + mr->page_shift = PAGE_SHIFT; + mr->page_mask = PAGE_SIZE - 1; + num_buf = 0; + map = mr->map; if (length > 0) { buf = map[0]->buf; @@ -214,29 +164,33 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, pr_warn("%s: Unable to get virtual address\n", __func__); err = -ENOMEM; - goto err_release_umem; + goto err_cleanup_map; } buf->addr = (uintptr_t)vaddr; buf->size = PAGE_SIZE; num_buf++; buf++; + } } mr->ibmr.pd = &pd->ibpd; mr->umem = umem; mr->access = access; + mr->length = length; + mr->iova = iova; + mr->va = start; + mr->offset = ib_umem_offset(umem); mr->state = RXE_MR_STATE_VALID; mr->type = IB_MR_TYPE_USER; - set->length = length; - set->iova = iova; - set->va = start; - set->offset = ib_umem_offset(umem); - return 0; +err_cleanup_map: + for (i = 0; i < mr->num_map; i++) + kfree(mr->map[i]); + kfree(mr->map); err_release_umem: ib_umem_release(umem); err_out: @@ -250,7 +204,7 @@ int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr) /* always allow remote access for FMRs */ rxe_mr_init(IB_ACCESS_REMOTE, mr); - err = rxe_mr_alloc(mr, max_pages, 1); + err = rxe_mr_alloc(mr, max_pages); if (err) goto err1; @@ -268,24 +222,21 @@ err1: static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out, size_t *offset_out) { - struct rxe_map_set *set = mr->cur_map_set; - size_t offset = iova - set->iova + set->offset; + size_t offset = iova - mr->iova + mr->offset; int map_index; int buf_index; u64 length; - struct rxe_map *map; - if (likely(set->page_shift)) { - *offset_out = offset & set->page_mask; - offset >>= set->page_shift; + if (likely(mr->page_shift)) { + *offset_out = offset & mr->page_mask; + offset >>= mr->page_shift; *n_out = offset & mr->map_mask; *m_out = offset >> mr->map_shift; } else { map_index = 0; buf_index = 0; - map = set->map[map_index]; - length = map->buf[buf_index].size; + length = mr->map[map_index]->buf[buf_index].size; while (offset >= length) { offset -= length; @@ -295,8 +246,7 @@ static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out, map_index++; buf_index = 0; } - map = set->map[map_index]; - length = map->buf[buf_index].size; + length = mr->map[map_index]->buf[buf_index].size; } *m_out = map_index; @@ -317,7 +267,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) goto out; } - if (!mr->cur_map_set) { + if (!mr->map) { addr = (void *)(uintptr_t)iova; goto out; } @@ -330,13 +280,13 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) lookup_iova(mr, iova, &m, &n, &offset); - if (offset + length > mr->cur_map_set->map[m]->buf[n].size) { + if (offset + length > mr->map[m]->buf[n].size) { pr_warn("crosses page boundary\n"); addr = NULL; goto out; } - addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset; + addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset; out: return addr; @@ -372,7 +322,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, return 0; } - WARN_ON_ONCE(!mr->cur_map_set); + WARN_ON_ONCE(!mr->map); err = mr_check_range(mr, iova, length); if (err) { @@ -382,7 +332,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, lookup_iova(mr, iova, &m, &i, &offset); - map = mr->cur_map_set->map + m; + map = mr->map + m; buf = map[0]->buf + i; while (length > 0) { @@ -576,22 +526,22 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, return mr; } -int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey) +int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_mr *mr; int ret; - mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); + mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { - pr_err("%s: No MR for rkey %#x\n", __func__, rkey); + pr_err("%s: No MR for key %#x\n", __func__, key); ret = -EINVAL; goto err; } - if (rkey != mr->rkey) { - pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n", - __func__, rkey, mr->rkey); + if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { + pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n", + __func__, key, (mr->rkey ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } @@ -628,9 +578,8 @@ err: int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) { struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); - u32 key = wqe->wr.wr.reg.key & 0xff; + u32 key = wqe->wr.wr.reg.key; u32 access = wqe->wr.wr.reg.access; - struct rxe_map_set *set; /* user can only register MR in free state */ if (unlikely(mr->state != RXE_MR_STATE_FREE)) { @@ -646,36 +595,19 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) return -EINVAL; } + /* user is only allowed to change key portion of l/rkey */ + if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { + pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n", + __func__, key, mr->lkey); + return -EINVAL; + } + mr->access = access; - mr->lkey = (mr->lkey & ~0xff) | key; - mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0; + mr->lkey = key; + mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0; + mr->iova = wqe->wr.wr.reg.mr->iova; mr->state = RXE_MR_STATE_VALID; - set = mr->cur_map_set; - mr->cur_map_set = mr->next_map_set; - mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova; - mr->next_map_set = set; - - return 0; -} - -int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr) -{ - struct rxe_mr *mr = to_rmr(ibmr); - struct rxe_map_set *set = mr->next_map_set; - struct rxe_map *map; - struct rxe_phys_buf *buf; - - if (unlikely(set->nbuf == mr->num_buf)) - return -ENOMEM; - - map = set->map[set->nbuf / RXE_BUF_PER_MAP]; - buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP]; - - buf->addr = addr; - buf->size = ibmr->page_size; - set->nbuf++; - return 0; } @@ -687,7 +619,7 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (atomic_read(&mr->num_mw) > 0) return -EINVAL; - rxe_put(mr); + rxe_cleanup(mr); return 0; } @@ -695,14 +627,15 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); + int i; rxe_put(mr_pd(mr)); - ib_umem_release(mr->umem); - if (mr->cur_map_set) - rxe_mr_free_map_set(mr->num_map, mr->cur_map_set); + if (mr->map) { + for (i = 0; i < mr->num_map; i++) + kfree(mr->map[i]); - if (mr->next_map_set) - rxe_mr_free_map_set(mr->num_map, mr->next_map_set); + kfree(mr->map); + } } diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 2e1fa844fabf..104993801a80 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -33,6 +33,8 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) RXE_MW_STATE_FREE : RXE_MW_STATE_VALID; spin_lock_init(&mw->lock); + rxe_finalize(mw); + return 0; } @@ -40,7 +42,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) { struct rxe_mw *mw = to_rmw(ibmw); - rxe_put(mw); + rxe_cleanup(mw); return 0; } @@ -48,8 +50,6 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, struct rxe_mw *mw, struct rxe_mr *mr) { - u32 key = wqe->wr.wr.mw.rkey & 0xff; - if (mw->ibmw.type == IB_MW_TYPE_1) { if (unlikely(mw->state != RXE_MW_STATE_VALID)) { pr_err_once( @@ -87,11 +87,6 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } } - if (unlikely(key == (mw->rkey & 0xff))) { - pr_err_once("attempt to bind MW with same key\n"); - return -EINVAL; - } - /* remaining checks only apply to a nonzero MR */ if (!mr) return 0; @@ -113,21 +108,21 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { pr_err_once( - "attempt to bind an writeable MW to an MR without local write access\n"); + "attempt to bind an Writable MW to an MR without local write access\n"); return -EINVAL; } /* C10-75 */ if (mw->access & IB_ZERO_BASED) { - if (unlikely(wqe->wr.wr.mw.length > mr->cur_map_set->length)) { + if (unlikely(wqe->wr.wr.mw.length > mr->length)) { pr_err_once( "attempt to bind a ZB MW outside of the MR\n"); return -EINVAL; } } else { - if (unlikely((wqe->wr.wr.mw.addr < mr->cur_map_set->iova) || + if (unlikely((wqe->wr.wr.mw.addr < mr->iova) || ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) > - (mr->cur_map_set->iova + mr->cur_map_set->length)))) { + (mr->iova + mr->length)))) { pr_err_once( "attempt to bind a VA MW outside of the MR\n"); return -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 568a7cbd13d4..86c7a8bf3cbb 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -105,6 +105,12 @@ enum rxe_device_param { RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + /* Max number of interations of each tasklet + * before yielding the cpu to let other + * work make progress + */ + RXE_MAX_ITERATIONS = 1024, + /* Delay before calling arbiter timer */ RXE_NSEC_ARB_TIMER_DELAY = 200, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 19b14826385b..f50620f5a0a1 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -6,6 +6,7 @@ #include "rxe.h" +#define RXE_POOL_TIMEOUT (200) #define RXE_POOL_ALIGN (16) static const struct rxe_type_info { @@ -136,10 +137,14 @@ void *rxe_alloc(struct rxe_pool *pool) elem->pool = pool; elem->obj = obj; kref_init(&elem->ref_cnt); + init_completion(&elem->complete); - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, + /* allocate index in array but leave pointer as NULL so it + * can't be looked up until rxe_finalize() is called + */ + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, &pool->next, GFP_KERNEL); - if (err) + if (err < 0) goto err_free; return obj; @@ -151,9 +156,11 @@ err_cnt: return NULL; } -int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable) { int err; + gfp_t gfp_flags; if (WARN_ON(pool->type == RXE_TYPE_MR)) return -EINVAL; @@ -164,10 +171,19 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) elem->pool = pool; elem->obj = (u8 *)elem - pool->elem_offset; kref_init(&elem->ref_cnt); - - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, - &pool->next, GFP_KERNEL); - if (err) + init_completion(&elem->complete); + + /* AH objects are unique in that the create_ah verb + * can be called in atomic context. If the create_ah + * call is not sleepable use GFP_ATOMIC. + */ + gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC; + + if (sleepable) + might_sleep(); + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, + &pool->next, gfp_flags); + if (err < 0) goto err_cnt; return 0; @@ -181,16 +197,15 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) { struct rxe_pool_elem *elem; struct xarray *xa = &pool->xa; - unsigned long flags; void *obj; - xa_lock_irqsave(xa, flags); + rcu_read_lock(); elem = xa_load(xa, index); if (elem && kref_get_unless_zero(&elem->ref_cnt)) obj = elem->obj; else obj = NULL; - xa_unlock_irqrestore(xa, flags); + rcu_read_unlock(); return obj; } @@ -198,17 +213,74 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) static void rxe_elem_release(struct kref *kref) { struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt); - struct rxe_pool *pool = elem->pool; - xa_erase(&pool->xa, elem->index); + complete(&elem->complete); +} + +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) +{ + struct rxe_pool *pool = elem->pool; + struct xarray *xa = &pool->xa; + static int timeout = RXE_POOL_TIMEOUT; + int ret, err = 0; + void *xa_ret; + + if (sleepable) + might_sleep(); + + /* erase xarray entry to prevent looking up + * the pool elem from its index + */ + xa_ret = xa_erase(xa, elem->index); + WARN_ON(xa_err(xa_ret)); + + /* if this is the last call to rxe_put complete the + * object. It is safe to touch obj->elem after this since + * it is freed below + */ + __rxe_put(elem); + + /* wait until all references to the object have been + * dropped before final object specific cleanup and + * return to rdma-core + */ + if (sleepable) { + if (!completion_done(&elem->complete) && timeout) { + ret = wait_for_completion_timeout(&elem->complete, + timeout); + + /* Shouldn't happen. There are still references to + * the object but, rather than deadlock, free the + * object or pass back to rdma-core. + */ + if (WARN_ON(!ret)) + err = -EINVAL; + } + } else { + unsigned long until = jiffies + timeout; + + /* AH objects are unique in that the destroy_ah verb + * can be called in atomic context. This delay + * replaces the wait_for_completion call above + * when the destroy_ah call is not sleepable + */ + while (!completion_done(&elem->complete) && + time_before(jiffies, until)) + mdelay(1); + + if (WARN_ON(!completion_done(&elem->complete))) + err = -EINVAL; + } if (pool->cleanup) pool->cleanup(elem); if (pool->type == RXE_TYPE_MR) - kfree(elem->obj); + kfree_rcu(elem->obj); atomic_dec(&pool->num_elem); + + return err; } int __rxe_get(struct rxe_pool_elem *elem) @@ -220,3 +292,11 @@ int __rxe_put(struct rxe_pool_elem *elem) { return kref_put(&elem->ref_cnt, rxe_elem_release); } + +void __rxe_finalize(struct rxe_pool_elem *elem) +{ + void *xa_ret; + + xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL); + WARN_ON(xa_err(xa_ret)); +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 0860660d65ec..9d83cb32092f 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -24,6 +24,7 @@ struct rxe_pool_elem { void *obj; struct kref ref_cnt; struct list_head list; + struct completion complete; u32 index; }; @@ -57,21 +58,28 @@ void rxe_pool_cleanup(struct rxe_pool *pool); void *rxe_alloc(struct rxe_pool *pool); /* connect already allocated object to pool */ -int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); - -#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem) +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable); +#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true) +#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \ + &(obj)->elem, sleepable) /* lookup an indexed object from index. takes a reference on object */ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); int __rxe_get(struct rxe_pool_elem *elem); - #define rxe_get(obj) __rxe_get(&(obj)->elem) int __rxe_put(struct rxe_pool_elem *elem); - #define rxe_put(obj) __rxe_put(&(obj)->elem) +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable); +#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true) +#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable) + #define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt) +void __rxe_finalize(struct rxe_pool_elem *elem); +#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem) + #endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 22e9b85344c3..eef91b8cb4ed 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -120,17 +120,15 @@ static void free_rd_atomic_resources(struct rxe_qp *qp) for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; - free_rd_atomic_resource(qp, res); + free_rd_atomic_resource(res); } kfree(qp->resp.resources); qp->resp.resources = NULL; } } -void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +void free_rd_atomic_resource(struct resp_res *res) { - if (res->type == RXE_ATOMIC_MASK) - kfree_skb(res->atomic.skb); res->type = 0; } @@ -142,7 +140,7 @@ static void cleanup_rd_atomic_resources(struct rxe_qp *qp) if (qp->resp.resources) { for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { res = &qp->resp.resources[i]; - free_rd_atomic_resource(qp, res); + free_rd_atomic_resource(res); } } } @@ -507,6 +505,7 @@ static void rxe_qp_reset(struct rxe_qp *qp) atomic_set(&qp->ssn, 0); qp->req.opcode = -1; qp->req.need_retry = 0; + qp->req.wait_for_rnr_timer = 0; qp->req.noack_pkts = 0; qp->resp.msn = 0; qp->resp.opcode = -1; @@ -804,13 +803,15 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); - atomic_dec(&qp->scq->num_wq); - if (qp->scq) + if (qp->scq) { + atomic_dec(&qp->scq->num_wq); rxe_put(qp->scq); + } - atomic_dec(&qp->rcq->num_wq); - if (qp->rcq) + if (qp->rcq) { + atomic_dec(&qp->rcq->num_wq); rxe_put(qp->rcq); + } if (qp->pd) rxe_put(qp->pd); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index 6227112ef7a2..ed44042782fa 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -7,9 +7,6 @@ #ifndef RXE_QUEUE_H #define RXE_QUEUE_H -/* for definition of shared struct rxe_queue_buf */ -#include <uapi/rdma/rdma_user_rxe.h> - /* Implements a simple circular buffer that is shared between user * and the driver and can be resized. The requested element size is * rounded up to a power of 2 and the number of elements in the buffer @@ -53,6 +50,8 @@ enum queue_type { QUEUE_TYPE_FROM_DRIVER, }; +struct rxe_queue_buf; + struct rxe_queue { struct rxe_dev *rxe; struct rxe_queue_buf *buf; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 9d98237389cf..49e8f54db6f5 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -15,8 +15,7 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, u32 opcode); static inline void retry_first_write_send(struct rxe_qp *qp, - struct rxe_send_wqe *wqe, - unsigned int mask, int npsn) + struct rxe_send_wqe *wqe, int npsn) { int i; @@ -83,7 +82,7 @@ static void req_retry(struct rxe_qp *qp) if (mask & WR_WRITE_OR_SEND_MASK) { npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; - retry_first_write_send(qp, wqe, mask, npsn); + retry_first_write_send(qp, wqe, npsn); } if (mask & WR_READ_MASK) { @@ -101,7 +100,11 @@ void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); - pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp)); + pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp)); + + /* request a send queue retry */ + qp->req.need_retry = 1; + qp->req.wait_for_rnr_timer = 0; rxe_run_task(&qp->req.task, 1); } @@ -161,16 +164,36 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) (wqe->state != wqe_state_processing))) return NULL; - if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) && - (index != cons))) { - qp->req.wait_fence = 1; - return NULL; - } - wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; } +/** + * rxe_wqe_is_fenced - check if next wqe is fenced + * @qp: the queue pair + * @wqe: the next wqe + * + * Returns: 1 if wqe needs to wait + * 0 if wqe is ready to go + */ +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + /* Local invalidate fence (LIF) see IBA 10.6.5.1 + * Requires ALL previous operations on the send queue + * are complete. Make mandatory for the rxe driver. + */ + if (wqe->wr.opcode == IB_WR_LOCAL_INV) + return qp->req.wqe_index != queue_get_consumer(qp->sq.queue, + QUEUE_TYPE_FROM_CLIENT); + + /* Fence see IBA 10.8.3.3 + * Requires that all previous read and atomic operations + * are complete. + */ + return (wqe->wr.send_flags & IB_SEND_FENCE) && + atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic; +} + static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) { switch (opcode) { @@ -581,9 +604,11 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) wqe->status = IB_WC_SUCCESS; qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); - if ((wqe->wr.send_flags & IB_SEND_SIGNALED) || - qp->sq_sig_type == IB_SIGNAL_ALL_WR) - rxe_run_task(&qp->comp.task, 1); + /* There is no ack coming for local work requests + * which can lead to a deadlock. So go ahead and complete + * it now. + */ + rxe_run_task(&qp->comp.task, 1); return 0; } @@ -599,6 +624,7 @@ int rxe_requester(void *arg) u32 payload; int mtu; int opcode; + int err; int ret; struct rxe_send_wqe rollback_wqe; u32 rollback_psn; @@ -609,7 +635,6 @@ int rxe_requester(void *arg) if (!rxe_get(qp)) return -EAGAIN; -next_wqe: if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) goto exit; @@ -620,10 +645,17 @@ next_wqe: qp->req.need_rd_atomic = 0; qp->req.wait_psn = 0; qp->req.need_retry = 0; + qp->req.wait_for_rnr_timer = 0; goto exit; } - if (unlikely(qp->req.need_retry)) { + /* we come here if the retransmit timer has fired + * or if the rnr timer has fired. If the retransmit + * timer fires while we are processing an RNR NAK wait + * until the rnr timer has fired before starting the + * retry flow + */ + if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) { req_retry(qp); qp->req.need_retry = 0; } @@ -632,12 +664,17 @@ next_wqe: if (unlikely(!wqe)) goto exit; + if (rxe_wqe_is_fenced(qp, wqe)) { + qp->req.wait_fence = 1; + goto exit; + } + if (wqe->mask & WR_LOCAL_OP_MASK) { - ret = rxe_do_local_ops(qp, wqe); - if (unlikely(ret)) + err = rxe_do_local_ops(qp, wqe); + if (unlikely(err)) goto err; else - goto next_wqe; + goto done; } if (unlikely(qp_type(qp) == IB_QPT_RC && @@ -685,9 +722,8 @@ next_wqe: qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - __rxe_do_task(&qp->comp.task); - rxe_put(qp); - return 0; + rxe_run_task(&qp->comp.task, 0); + goto done; } payload = mtu; } @@ -703,25 +739,29 @@ next_wqe: if (unlikely(!av)) { pr_err("qp#%d Failed no address vector\n", qp_num(qp)); wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err_drop_ah; + goto err; } skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); if (unlikely(!skb)) { pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err_drop_ah; + if (ah) + rxe_put(ah); + goto err; } - ret = finish_packet(qp, av, wqe, &pkt, skb, payload); - if (unlikely(ret)) { + err = finish_packet(qp, av, wqe, &pkt, skb, payload); + if (unlikely(err)) { pr_debug("qp#%d Error during finish packet\n", qp_num(qp)); - if (ret == -EFAULT) + if (err == -EFAULT) wqe->status = IB_WC_LOC_PROT_ERR; else wqe->status = IB_WC_LOC_QP_OP_ERR; kfree_skb(skb); - goto err_drop_ah; + if (ah) + rxe_put(ah); + goto err; } if (ah) @@ -736,13 +776,14 @@ next_wqe: save_state(wqe, qp, &rollback_wqe, &rollback_psn); update_wqe_state(qp, wqe, &pkt); update_wqe_psn(qp, wqe, &pkt, payload); - ret = rxe_xmit_packet(qp, &pkt, skb); - if (ret) { + + err = rxe_xmit_packet(qp, &pkt, skb); + if (err) { qp->need_req_skb = 1; rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - if (ret == -EAGAIN) { + if (err == -EAGAIN) { rxe_run_task(&qp->req.task, 1); goto exit; } @@ -753,16 +794,20 @@ next_wqe: update_state(qp, &pkt); - goto next_wqe; - -err_drop_ah: - if (ah) - rxe_put(ah); + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the tasklet. A zero return + * will continue looping and return to rxe_requester + */ +done: + ret = 0; + goto out; err: wqe->state = wqe_state_error; - __rxe_do_task(&qp->comp.task); - + rxe_run_task(&qp->comp.task, 0); exit: + ret = -EAGAIN; +out: rxe_put(qp); - return -EAGAIN; + + return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index f4f6ee5d81fe..b36ec5c4d5e0 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -21,6 +21,7 @@ enum resp_states { RESPST_CHK_RKEY, RESPST_EXECUTE, RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -55,6 +56,7 @@ static char *resp_state_name[] = { [RESPST_CHK_RKEY] = "CHK_RKEY", [RESPST_EXECUTE] = "EXECUTE", [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -448,7 +450,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (rkey_is_mw(rkey)) { mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { - pr_err("%s: no MW matches rkey %#x\n", __func__, rkey); + pr_debug("%s: no MW matches rkey %#x\n", + __func__, rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -468,7 +471,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { - pr_err("%s: no MR matches rkey %#x\n", __func__, rkey); + pr_debug("%s: no MR matches rkey %#x\n", + __func__, rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -549,49 +553,106 @@ out: return rc; } +static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + int type) +{ + struct resp_res *res; + u32 pkts; + + res = &qp->resp.resources[qp->resp.res_head]; + rxe_advance_resp_resource(qp); + free_rd_atomic_resource(res); + + res->type = type; + res->replay = 0; + + switch (type) { + case RXE_READ_MASK: + res->read.va = qp->resp.va + qp->resp.offset; + res->read.va_org = qp->resp.va + qp->resp.offset; + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); + res->first_psn = pkt->psn; + res->cur_psn = pkt->psn; + res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; + + res->state = rdatm_res_state_new; + break; + case RXE_ATOMIC_MASK: + res->first_psn = pkt->psn; + res->last_psn = pkt->psn; + res->cur_psn = pkt->psn; + break; + } + + return res; +} + /* Guarantee atomicity of atomic operations at the machine level. */ static DEFINE_SPINLOCK(atomic_ops_lock); -static enum resp_states process_atomic(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states atomic_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { u64 *vaddr; enum resp_states ret; struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + u64 value; - if (mr->state != RXE_MR_STATE_VALID) { - ret = RESPST_ERR_RKEY_VIOLATION; - goto out; + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); + qp->resp.res = res; } - vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64)); + if (!res->replay) { + if (mr->state != RXE_MR_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } - /* check vaddr is 8 bytes aligned. */ - if (!vaddr || (uintptr_t)vaddr & 7) { - ret = RESPST_ERR_MISALIGNED_ATOMIC; - goto out; - } + vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, + sizeof(u64)); - spin_lock_bh(&atomic_ops_lock); + /* check vaddr is 8 bytes aligned. */ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } - qp->resp.atomic_orig = *vaddr; + spin_lock_bh(&atomic_ops_lock); + res->atomic.orig_val = value = *vaddr; - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { - if (*vaddr == atmeth_comp(pkt)) - *vaddr = atmeth_swap_add(pkt); - } else { - *vaddr += atmeth_swap_add(pkt); - } + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == atmeth_comp(pkt)) + value = atmeth_swap_add(pkt); + } else { + value += atmeth_swap_add(pkt); + } + + *vaddr = value; + spin_unlock_bh(&atomic_ops_lock); + + qp->resp.msn++; - spin_unlock_bh(&atomic_ops_lock); + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; - ret = RESPST_NONE; + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + } + + ret = RESPST_ACKNOWLEDGE; out: return ret; } static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, - struct rxe_pkt_info *pkt, struct rxe_pkt_info *ack, int opcode, int payload, @@ -629,7 +690,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, } if (ack->mask & RXE_ATMACK_MASK) - atmack_set_orig(ack, qp->resp.atomic_orig); + atmack_set_orig(ack, qp->resp.res->atomic.orig_val); err = rxe_prepare(&qp->pri_av, ack, skb); if (err) { @@ -640,34 +701,6 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, return skb; } -static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - struct resp_res *res; - u32 pkts; - - res = &qp->resp.resources[qp->resp.res_head]; - rxe_advance_resp_resource(qp); - free_rd_atomic_resource(qp, res); - - res->type = RXE_READ_MASK; - res->replay = 0; - res->read.va = qp->resp.va + qp->resp.offset; - res->read.va_org = qp->resp.va + qp->resp.offset; - res->read.resid = qp->resp.resid; - res->read.length = qp->resp.resid; - res->read.rkey = qp->resp.rkey; - - pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); - res->first_psn = pkt->psn; - res->cur_psn = pkt->psn; - res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; - - res->state = rdatm_res_state_new; - - return res; -} - /** * rxe_recheck_mr - revalidate MR from rkey and get a reference * @qp: the qp @@ -738,7 +771,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, struct rxe_mr *mr; if (!res) { - res = rxe_prepare_read_res(qp, req_pkt); + res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); qp->resp.res = res; } @@ -771,7 +804,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, payload = min_t(int, res->read.resid, mtu); - skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) return RESPST_ERR_RNR; @@ -858,9 +891,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.msn++; return RESPST_READ_REPLY; } else if (pkt->mask & RXE_ATOMIC_MASK) { - err = process_atomic(qp, pkt); - if (err) - return err; + return RESPST_ATOMIC_REPLY; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -997,14 +1028,13 @@ finish: return RESPST_CLEANUP; } -static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, - u8 syndrome, u32 psn) +static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { int err = 0; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; - skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, 0, psn, syndrome); if (!skb) { err = -ENOMEM; @@ -1019,40 +1049,29 @@ err1: return err; } -static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, - u8 syndrome) +static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { - int rc = 0; + int err = 0; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; - struct resp_res *res; - skb = prepare_ack_packet(qp, pkt, &ack_pkt, - IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, - syndrome); + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, + 0, psn, syndrome); if (!skb) { - rc = -ENOMEM; + err = -ENOMEM; goto out; } - res = &qp->resp.resources[qp->resp.res_head]; - free_rd_atomic_resource(qp, res); - rxe_advance_resp_resource(qp); - - skb_get(skb); - res->type = RXE_ATOMIC_MASK; - res->atomic.skb = skb; - res->first_psn = ack_pkt.psn; - res->last_psn = ack_pkt.psn; - res->cur_psn = ack_pkt.psn; + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) + pr_err_ratelimited("Failed sending atomic ack\n"); - rc = rxe_xmit_packet(qp, &ack_pkt, skb); - if (rc) { - pr_err_ratelimited("Failed sending ack\n"); - rxe_put(qp); - } + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; out: - return rc; + return err; } static enum resp_states acknowledge(struct rxe_qp *qp, @@ -1062,11 +1081,11 @@ static enum resp_states acknowledge(struct rxe_qp *qp, return RESPST_CLEANUP; if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) - send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) - send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) - send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); return RESPST_CLEANUP; } @@ -1119,7 +1138,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { /* SEND. Ack again and cleanup. C9-105. */ - send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn); + send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); return RESPST_CLEANUP; } else if (pkt->mask & RXE_READ_MASK) { struct resp_res *res; @@ -1173,14 +1192,11 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, /* Find the operation in our list of responder resources. */ res = find_resource(qp, pkt->psn); if (res) { - skb_get(res->atomic.skb); - /* Resend the result. */ - rc = rxe_xmit_packet(qp, pkt, res->atomic.skb); - if (rc) { - pr_err("Failed resending result. This flow is not handled - skb ignored\n"); - rc = RESPST_CLEANUP; - goto out; - } + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = RESPST_ATOMIC_REPLY; + goto out; } /* Resource not found. Class D error. Drop the request. */ @@ -1260,17 +1276,15 @@ int rxe_responder(void *arg) struct rxe_dev *rxe = to_rdev(qp->ibqp.device); enum resp_states state; struct rxe_pkt_info *pkt = NULL; - int ret = 0; + int ret; if (!rxe_get(qp)) return -EAGAIN; qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; - if (!qp->valid) { - ret = -EINVAL; - goto done; - } + if (!qp->valid) + goto exit; switch (qp->resp.state) { case QP_STATE_RESET: @@ -1316,6 +1330,9 @@ int rxe_responder(void *arg) case RESPST_READ_REPLY: state = read_reply(qp, pkt); break; + case RESPST_ATOMIC_REPLY: + state = atomic_reply(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; @@ -1327,7 +1344,7 @@ int rxe_responder(void *arg) break; case RESPST_ERR_PSN_OUT_OF_SEQ: /* RC only - Class B. Drop packet. */ - send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); state = RESPST_CLEANUP; break; @@ -1349,7 +1366,7 @@ int rxe_responder(void *arg) if (qp_type(qp) == IB_QPT_RC) { rxe_counter_inc(rxe, RXE_CNT_SND_RNR); /* RC - class B */ - send_ack(qp, pkt, AETH_RNR_NAK | + send_ack(qp, AETH_RNR_NAK | (~AETH_TYPE_MASK & qp->attr.min_rnr_timer), pkt->psn); @@ -1438,7 +1455,7 @@ int rxe_responder(void *arg) case RESPST_ERROR: qp->resp.goto_error = 0; - pr_warn("qp#%d moved to error state\n", qp_num(qp)); + pr_debug("qp#%d moved to error state\n", qp_num(qp)); rxe_qp_error(qp); goto exit; @@ -1447,9 +1464,16 @@ int rxe_responder(void *arg) } } + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the tasklet. A zero return + * will continue looping and return to rxe_responder + */ +done: + ret = 0; + goto out; exit: ret = -EAGAIN; -done: +out: rxe_put(qp); return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 0c4db5bb17d7..2248cf33d776 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -8,7 +8,7 @@ #include <linux/interrupt.h> #include <linux/hardirq.h> -#include "rxe_task.h" +#include "rxe.h" int __rxe_do_task(struct rxe_task *task) @@ -33,6 +33,7 @@ void rxe_do_task(struct tasklet_struct *t) int cont; int ret; struct rxe_task *task = from_tasklet(task, t, tasklet); + unsigned int iterations = RXE_MAX_ITERATIONS; spin_lock_bh(&task->state_lock); switch (task->state) { @@ -61,13 +62,20 @@ void rxe_do_task(struct tasklet_struct *t) spin_lock_bh(&task->state_lock); switch (task->state) { case TASK_STATE_BUSY: - if (ret) + if (ret) { task->state = TASK_STATE_START; - else + } else if (iterations--) { cont = 1; + } else { + /* reschedule the tasklet and exit + * the loop to give up the cpu + */ + tasklet_schedule(&task->tasklet); + task->state = TASK_STATE_START; + } break; - /* soneone tried to run the task since the last time we called + /* someone tried to run the task since the last time we called * func, so we will call one more time regardless of the * return value */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9d995854a174..e264cf69bf55 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -115,7 +115,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); - rxe_put(uc); + rxe_cleanup(uc); } static int rxe_port_immutable(struct ib_device *dev, u32 port_num, @@ -149,7 +149,7 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); - rxe_put(pd); + rxe_cleanup(pd); return 0; } @@ -176,7 +176,8 @@ static int rxe_create_ah(struct ib_ah *ibah, if (err) return err; - err = rxe_add_to_pool(&rxe->ah_pool, ah); + err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) return err; @@ -188,7 +189,7 @@ static int rxe_create_ah(struct ib_ah *ibah, err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { - rxe_put(ah); + rxe_cleanup(ah); return -EFAULT; } } else if (ah->is_user) { @@ -197,6 +198,8 @@ static int rxe_create_ah(struct ib_ah *ibah, } rxe_init_av(init_attr->ah_attr, &ah->av); + rxe_finalize(ah); + return 0; } @@ -228,7 +231,8 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); - rxe_put(ah); + rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); + return 0; } @@ -308,12 +312,13 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) - goto err_put; + goto err_cleanup; return 0; -err_put: - rxe_put(srq); +err_cleanup: + rxe_cleanup(srq); + return err; } @@ -362,7 +367,7 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); - rxe_put(srq); + rxe_cleanup(srq); return 0; } @@ -429,10 +434,11 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, if (err) goto qp_init; + rxe_finalize(qp); return 0; qp_init: - rxe_put(qp); + rxe_cleanup(qp); return err; } @@ -485,7 +491,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (ret) return ret; - rxe_put(qp); + rxe_cleanup(qp); return 0; } @@ -803,7 +809,7 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) rxe_cq_disable(cq); - rxe_put(cq); + rxe_cleanup(cq); return 0; } @@ -898,6 +904,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_get(pd); rxe_mr_init_dma(pd, access, mr); + rxe_finalize(mr); return &mr->ibmr; } @@ -926,11 +933,13 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, if (err) goto err3; + rxe_finalize(mr); + return &mr->ibmr; err3: rxe_put(pd); - rxe_put(mr); + rxe_cleanup(mr); err2: return ERR_PTR(err); } @@ -958,35 +967,52 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, if (err) goto err2; + rxe_finalize(mr); + return &mr->ibmr; err2: rxe_put(pd); - rxe_put(mr); + rxe_cleanup(mr); err1: return ERR_PTR(err); } -/* build next_map_set from scatterlist - * The IB_WR_REG_MR WR will swap map_sets - */ +static int rxe_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rxe_mr *mr = to_rmr(ibmr); + struct rxe_map *map; + struct rxe_phys_buf *buf; + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; + + buf->addr = addr; + buf->size = ibmr->page_size; + mr->nbuf++; + + return 0; +} + static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { struct rxe_mr *mr = to_rmr(ibmr); - struct rxe_map_set *set = mr->next_map_set; int n; - set->nbuf = 0; + mr->nbuf = 0; - n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_mr_set_page); + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); - set->va = ibmr->iova; - set->iova = ibmr->iova; - set->length = ibmr->length; - set->page_shift = ilog2(ibmr->page_size); - set->page_mask = ibmr->page_size - 1; - set->offset = set->iova & set->page_mask; + mr->va = ibmr->iova; + mr->iova = ibmr->iova; + mr->length = ibmr->length; + mr->page_shift = ilog2(ibmr->page_size); + mr->page_mask = ibmr->page_size - 1; + mr->offset = mr->iova & mr->page_mask; return n; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index ac464e68c923..a24fbe984066 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -9,7 +9,6 @@ #include <linux/interrupt.h> #include <linux/workqueue.h> -#include <rdma/rdma_user_rxe.h> #include "rxe_pool.h" #include "rxe_task.h" #include "rxe_hw_counters.h" @@ -124,6 +123,7 @@ struct rxe_req_info { int need_rd_atomic; int wait_psn; int need_retry; + int wait_for_rnr_timer; int noack_pkts; struct rxe_task task; }; @@ -155,7 +155,7 @@ struct resp_res { union { struct { - struct sk_buff *skb; + u64 orig_val; } atomic; struct { u64 va_org; @@ -189,7 +189,6 @@ struct rxe_resp_info { u32 resid; u32 rkey; u32 length; - u64 atomic_orig; /* SRQ only */ struct { @@ -288,17 +287,6 @@ struct rxe_map { struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; }; -struct rxe_map_set { - struct rxe_map **map; - u64 va; - u64 iova; - size_t length; - u32 offset; - u32 nbuf; - int page_shift; - int page_mask; -}; - static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; @@ -316,20 +304,26 @@ struct rxe_mr { u32 rkey; enum rxe_mr_state state; enum ib_mr_type type; + u64 va; + u64 iova; + size_t length; + u32 offset; int access; + int page_shift; + int page_mask; int map_shift; int map_mask; u32 num_buf; + u32 nbuf; u32 max_buf; u32 num_map; atomic_t num_mw; - struct rxe_map_set *cur_map_set; - struct rxe_map_set *next_map_set; + struct rxe_map **map; }; enum rxe_mw_state { diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 17f34d584cd9..f88d2971c2c6 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -725,11 +725,11 @@ static int siw_proc_mpareply(struct siw_cep *cep) enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); - if (rv != -EAGAIN) - siw_cancel_mpatimer(cep); if (rv) goto out_err; + siw_cancel_mpatimer(cep); + rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { @@ -895,7 +895,8 @@ static int siw_proc_mpareply(struct siw_cep *cep) } out_err: - siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + if (rv != -EAGAIN) + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 09316072b789..8dedae7ae79e 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1167,7 +1167,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, err_out: siw_dbg(base_cq->device, "CQ creation failed: %d", rv); - if (cq && cq->queue) { + if (cq->queue) { struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 2c3dca41d3bd..ab86165e45de 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1109,7 +1109,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) * if he sets the device address back to be based on GID index 0, * he no longer wishs to control it. * - * If the user doesn't control the the device address, + * If the user doesn't control the device address, * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means * the port GUID has changed and GID at index 0 has changed * so we need to change priv->local_gid and priv->dev->dev_addr diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2a8961b685c2..a4904371e2db 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1664,8 +1664,10 @@ static void ipoib_napi_add(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); - netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); + netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, + IPOIB_NUM_WC); + netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, + MAX_SEND_CQE); } static void ipoib_napi_del(struct net_device *dev) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index c08f2d9133b6..a00ca117303a 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -246,6 +246,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) device = ib_conn->device; ib_dev = device->ib_device; + /* +1 for drain */ if (ib_conn->pi_support) max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; else @@ -267,7 +268,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) init_attr.qp_context = (void *)ib_conn; init_attr.send_cq = ib_conn->cq; init_attr.recv_cq = ib_conn->cq; - init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + /* +1 for drain */ + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS + 1; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -485,7 +487,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_conn, err); /* block until all flush errors are consumed */ - ib_drain_sq(ib_conn->qp); + ib_drain_qp(ib_conn->qp); } return 1; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c index 385a19846c24..1e6ffafa2db3 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c @@ -32,11 +32,7 @@ void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con) void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats) { - struct rtrs_clt_stats_pcpu *s; - - s = get_cpu_ptr(stats->pcpu_stats); - s->rdma.failover_cnt++; - put_cpu_ptr(stats->pcpu_stats); + this_cpu_inc(stats->pcpu_stats->rdma.failover_cnt); } int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf) @@ -169,12 +165,8 @@ int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *s, bool enable) static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats, size_t size, int d) { - struct rtrs_clt_stats_pcpu *s; - - s = get_cpu_ptr(stats->pcpu_stats); - s->rdma.dir[d].cnt++; - s->rdma.dir[d].size_total += size; - put_cpu_ptr(stats->pcpu_stats); + this_cpu_inc(stats->pcpu_stats->rdma.dir[d].cnt); + this_cpu_add(stats->pcpu_stats->rdma.dir[d].size_total, size); } void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 9809c3883979..baecde41d126 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -740,25 +740,25 @@ struct path_it { struct rtrs_clt_path *(*next_path)(struct path_it *it); }; -/** - * list_next_or_null_rr_rcu - get next list element in round-robin fashion. +/* + * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL * @head: the head for the list. - * @ptr: the list head to take the next element from. - * @type: the type of the struct this is embedded in. - * @memb: the name of the list_head within the struct. + * @clt_path: The element to take the next clt_path from. * - * Next element returned in round-robin fashion, i.e. head will be skipped, + * Next clt path returned in round-robin fashion, i.e. head will be skipped, * but if list is observed as empty, NULL will be returned. * - * This primitive may safely run concurrently with the _rcu list-mutation + * This function may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ -#define list_next_or_null_rr_rcu(head, ptr, type, memb) \ -({ \ - list_next_or_null_rcu(head, ptr, type, memb) ?: \ - list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \ - type, memb); \ -}) +static inline struct rtrs_clt_path * +rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path) +{ + return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?: + list_next_or_null_rcu(head, + READ_ONCE((&clt_path->s.entry)->next), + typeof(*clt_path), s.entry); +} /** * get_next_path_rr() - Returns path in round-robin fashion. @@ -789,10 +789,8 @@ static struct rtrs_clt_path *get_next_path_rr(struct path_it *it) path = list_first_or_null_rcu(&clt->paths_list, typeof(*path), s.entry); else - path = list_next_or_null_rr_rcu(&clt->paths_list, - &path->s.entry, - typeof(*path), - s.entry); + path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path); + rcu_assign_pointer(*ppcpu_path, path); return path; @@ -1403,8 +1401,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt) unsigned int chunk_bits; int err, i; - clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth), - sizeof(long), GFP_KERNEL); + clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL); if (!clt->permits_map) { err = -ENOMEM; goto out_err; @@ -1426,7 +1423,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt) return 0; err_map: - kfree(clt->permits_map); + bitmap_free(clt->permits_map); clt->permits_map = NULL; out_err: return err; @@ -1434,13 +1431,11 @@ out_err: static void free_permits(struct rtrs_clt_sess *clt) { - if (clt->permits_map) { - size_t sz = clt->queue_depth; - + if (clt->permits_map) wait_event(clt->permits_wait, - find_first_bit(clt->permits_map, sz) >= sz); - } - kfree(clt->permits_map); + bitmap_empty(clt->permits_map, clt->queue_depth)); + + bitmap_free(clt->permits_map); clt->permits_map = NULL; kfree(clt->permits); clt->permits = NULL; @@ -2277,8 +2272,7 @@ static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path) * removed. If @sess is the last element, then @next is NULL. */ rcu_read_lock(); - next = list_next_or_null_rr_rcu(&clt->paths_list, &clt_path->s.entry, - typeof(*next), s.entry); + next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path); rcu_read_unlock(); /* diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index 9a1e5c2ae55c..ac0df734eba8 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -23,6 +23,17 @@ #define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \ __stringify(RTRS_PROTO_VER_MINOR) +/* + * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) + * and the minimum chunk size is 4096 (2^12). + * So the maximum sess_queue_depth is 65536 (2^16) in theory. + * But mempool_create, create_qp and ib_post_send fail with + * "cannot allocate memory" error if sess_queue_depth is too big. + * Therefore the pratical max value of sess_queue_depth is + * somewhere between 1 and 65534 and it depends on the system. + */ +#define MAX_SESS_QUEUE_DEPTH 65535 + enum rtrs_imm_const { MAX_IMM_TYPE_BITS = 4, MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1), @@ -46,16 +57,6 @@ enum { MAX_PATHS_NUM = 128, - /* - * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) - * and the minimum chunk size is 4096 (2^12). - * So the maximum sess_queue_depth is 65536 (2^16) in theory. - * But mempool_create, create_qp and ib_post_send fail with - * "cannot allocate memory" error if sess_queue_depth is too big. - * Therefore the pratical max value of sess_queue_depth is - * somewhere between 1 and 65534 and it depends on the system. - */ - MAX_SESS_QUEUE_DEPTH = 65535, MIN_CHUNK_SIZE = 8192, RTRS_HB_INTERVAL_MS = 5000, diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c index 44b1c1652131..2aff1213a19d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c @@ -14,9 +14,14 @@ int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) { if (enable) { - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + int cpu; + struct rtrs_srv_stats_rdma_stats *r; + + for_each_possible_cpu(cpu) { + r = per_cpu_ptr(stats->rdma_stats, cpu); + memset(r, 0, sizeof(*r)); + } - memset(r, 0, sizeof(*r)); return 0; } @@ -25,11 +30,22 @@ int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page) { - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + int cpu; + struct rtrs_srv_stats_rdma_stats sum; + struct rtrs_srv_stats_rdma_stats *r; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = per_cpu_ptr(stats->rdma_stats, cpu); + + sum.dir[READ].cnt += r->dir[READ].cnt; + sum.dir[READ].size_total += r->dir[READ].size_total; + sum.dir[WRITE].cnt += r->dir[WRITE].cnt; + sum.dir[WRITE].size_total += r->dir[WRITE].size_total; + } - return sysfs_emit(page, "%lld %lld %lld %lldn %u\n", - (s64)atomic64_read(&r->dir[READ].cnt), - (s64)atomic64_read(&r->dir[READ].size_total), - (s64)atomic64_read(&r->dir[WRITE].cnt), - (s64)atomic64_read(&r->dir[WRITE].size_total), 0); + return sysfs_emit(page, "%llu %llu %llu %llu\n", + sum.dir[READ].cnt, sum.dir[READ].size_total, + sum.dir[WRITE].cnt, sum.dir[WRITE].size_total); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index b94ae12c2795..2a3c9ac64a42 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -220,6 +220,8 @@ static void rtrs_srv_path_stats_release(struct kobject *kobj) stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats); + free_percpu(stats->rdma_stats); + kfree(stats); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 24024bce2566..34c03bde5064 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt #include <linux/module.h> -#include <linux/mempool.h> #include "rtrs-srv.h" #include "rtrs-log.h" @@ -26,11 +25,7 @@ MODULE_LICENSE("GPL"); #define DEFAULT_SESS_QUEUE_DEPTH 512 #define MAX_HDR_SIZE PAGE_SIZE -/* We guarantee to serve 10 paths at least */ -#define CHUNK_POOL_SZ 10 - static struct rtrs_rdma_dev_pd dev_pd; -static mempool_t *chunk_pool; struct class *rtrs_dev_class; static struct rtrs_srv_ib_ctx ib_ctx; @@ -1358,7 +1353,7 @@ static void free_srv(struct rtrs_srv_sess *srv) WARN_ON(refcount_read(&srv->refcount)); for (i = 0; i < srv->queue_depth; i++) - mempool_free(srv->chunks[i], chunk_pool); + __free_pages(srv->chunks[i], get_order(max_chunk_size)); kfree(srv->chunks); mutex_destroy(&srv->paths_mutex); mutex_destroy(&srv->paths_ev_mutex); @@ -1411,7 +1406,8 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx, goto err_free_srv; for (i = 0; i < srv->queue_depth; i++) { - srv->chunks[i] = mempool_alloc(chunk_pool, GFP_KERNEL); + srv->chunks[i] = alloc_pages(GFP_KERNEL, + get_order(max_chunk_size)); if (!srv->chunks[i]) goto err_free_chunks; } @@ -1424,7 +1420,7 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx, err_free_chunks: while (i--) - mempool_free(srv->chunks[i], chunk_pool); + __free_pages(srv->chunks[i], get_order(max_chunk_size)); kfree(srv->chunks); err_free_srv: @@ -1513,6 +1509,7 @@ static void free_path(struct rtrs_srv_path *srv_path) kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); } else { + free_percpu(srv_path->stats->rdma_stats); kfree(srv_path->stats); kfree(srv_path); } @@ -1755,13 +1752,17 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, if (!srv_path->stats) goto err_free_sess; + srv_path->stats->rdma_stats = alloc_percpu(struct rtrs_srv_stats_rdma_stats); + if (!srv_path->stats->rdma_stats) + goto err_free_stats; + srv_path->stats->srv_path = srv_path; srv_path->dma_addr = kcalloc(srv->queue_depth, sizeof(*srv_path->dma_addr), GFP_KERNEL); if (!srv_path->dma_addr) - goto err_free_stats; + goto err_free_percpu; srv_path->s.con = kcalloc(con_num, sizeof(*srv_path->s.con), GFP_KERNEL); @@ -1813,6 +1814,8 @@ err_free_con: kfree(srv_path->s.con); err_free_dma_addr: kfree(srv_path->dma_addr); +err_free_percpu: + free_percpu(srv_path->stats->rdma_stats); err_free_stats: kfree(srv_path->stats); err_free_sess: @@ -2266,14 +2269,10 @@ static int __init rtrs_server_init(void) err); return err; } - chunk_pool = mempool_create_page_pool(sess_queue_depth * CHUNK_POOL_SZ, - get_order(max_chunk_size)); - if (!chunk_pool) - return -ENOMEM; rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server"); if (IS_ERR(rtrs_dev_class)) { err = PTR_ERR(rtrs_dev_class); - goto out_chunk_pool; + goto out_err; } rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0); if (!rtrs_wq) { @@ -2285,9 +2284,7 @@ static int __init rtrs_server_init(void) out_dev_class: class_destroy(rtrs_dev_class); -out_chunk_pool: - mempool_destroy(chunk_pool); - +out_err: return err; } @@ -2295,7 +2292,6 @@ static void __exit rtrs_server_exit(void) { destroy_workqueue(rtrs_wq); class_destroy(rtrs_dev_class); - mempool_destroy(chunk_pool); rtrs_rdma_dev_pd_deinit(&dev_pd); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 6292e87f6afd..186a63c217df 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -12,6 +12,7 @@ #include <linux/device.h> #include <linux/refcount.h> +#include <linux/percpu.h> #include "rtrs-pri.h" /* @@ -29,15 +30,15 @@ enum rtrs_srv_state { */ struct rtrs_srv_stats_rdma_stats { struct { - atomic64_t cnt; - atomic64_t size_total; + u64 cnt; + u64 size_total; } dir[2]; }; struct rtrs_srv_stats { - struct kobject kobj_stats; - struct rtrs_srv_stats_rdma_stats rdma_stats; - struct rtrs_srv_path *srv_path; + struct kobject kobj_stats; + struct rtrs_srv_stats_rdma_stats __percpu *rdma_stats; + struct rtrs_srv_path *srv_path; }; struct rtrs_srv_con { @@ -130,8 +131,8 @@ void close_path(struct rtrs_srv_path *srv_path); static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, size_t size, int d) { - atomic64_inc(&s->rdma_stats.dir[d].cnt); - atomic64_add(size, &s->rdma_stats.dir[d].size_total); + this_cpu_inc(s->rdma_stats->dir[d].cnt); + this_cpu_add(s->rdma_stats->dir[d].size_total, size); } /* functions which are implemented in rtrs-srv-stats.c */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 2ccf7bef9b05..e735e19461ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -50,10 +50,12 @@ static int mlx5_cmd_stub_update_root_ft(struct mlx5_flow_root_namespace *ns, static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, - unsigned int size, + struct mlx5_flow_table_attr *ft_attr, struct mlx5_flow_table *next_ft) { - ft->max_fte = size ? roundup_pow_of_two(size) : 1; + int max_fte = ft_attr->max_fte; + + ft->max_fte = max_fte ? roundup_pow_of_two(max_fte) : 1; return 0; } @@ -258,7 +260,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, - unsigned int size, + struct mlx5_flow_table_attr *ft_attr, struct mlx5_flow_table *next_ft) { int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); @@ -267,17 +269,19 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {}; u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {}; struct mlx5_core_dev *dev = ns->dev; + unsigned int size; int err; - if (size != POOL_NEXT_SIZE) - size = roundup_pow_of_two(size); - size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size); + if (ft_attr->max_fte != POOL_NEXT_SIZE) + size = roundup_pow_of_two(ft_attr->max_fte); + size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte); if (!size) return -ENOSPC; MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); MLX5_SET(create_flow_table_in, in, table_type, ft->type); MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level); MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0); @@ -479,6 +483,30 @@ static int mlx5_set_extended_dest(struct mlx5_core_dev *dev, return 0; } + +static void +mlx5_cmd_set_fte_flow_meter(struct fs_fte *fte, void *in_flow_context) +{ + void *exe_aso_ctrl; + void *execute_aso; + + execute_aso = MLX5_ADDR_OF(flow_context, in_flow_context, + execute_aso[0]); + MLX5_SET(execute_aso, execute_aso, valid, 1); + MLX5_SET(execute_aso, execute_aso, aso_object_id, + fte->action.exe_aso.object_id); + + exe_aso_ctrl = MLX5_ADDR_OF(execute_aso, execute_aso, exe_aso_ctrl); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, return_reg_id, + fte->action.exe_aso.return_reg_id); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, aso_type, + fte->action.exe_aso.type); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, init_color, + fte->action.exe_aso.flow_meter.init_color); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, meter_id, + fte->action.exe_aso.flow_meter.meter_idx); +} + static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, int opmod, int modify_mask, struct mlx5_flow_table *ft, @@ -663,6 +691,15 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, list_size); } + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) { + if (fte->action.exe_aso.type == MLX5_EXE_ASO_FLOW_METER) { + mlx5_cmd_set_fte_flow_meter(fte, in_flow_context); + } else { + err = -EOPNOTSUPP; + goto err_out; + } + } + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); err_out: kvfree(in); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h index 274004e80f03..8ef4254b9ea1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -38,7 +38,7 @@ struct mlx5_flow_cmds { int (*create_flow_table)(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, - unsigned int size, + struct mlx5_flow_table_attr *ft_attr, struct mlx5_flow_table *next_ft); int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index fdcf7f529330..35d89edb1bcd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1155,7 +1155,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa find_next_chained_ft(fs_prio); ft->def_miss_action = ns->def_miss_action; ft->ns = ns; - err = root->cmds->create_flow_table(root, ft, ft_attr->max_fte, next_ft); + err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft); if (err) goto free_ft; @@ -1195,6 +1195,12 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, } EXPORT_SYMBOL(mlx5_create_flow_table); +u32 mlx5_flow_table_id(struct mlx5_flow_table *ft) +{ + return ft->id; +} +EXPORT_SYMBOL(mlx5_flow_table_id); + struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr *ft_attr, u16 vport) @@ -2866,6 +2872,14 @@ static int create_fdb_bypass(struct mlx5_flow_steering *steering) return 0; } +static void cleanup_fdb_root_ns(struct mlx5_flow_steering *steering) +{ + cleanup_root_ns(steering->fdb_root_ns); + steering->fdb_root_ns = NULL; + kfree(steering->fdb_sub_ns); + steering->fdb_sub_ns = NULL; +} + static int init_fdb_root_ns(struct mlx5_flow_steering *steering) { struct fs_prio *maj_prio; @@ -2916,10 +2930,7 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering) return 0; out_err: - cleanup_root_ns(steering->fdb_root_ns); - kfree(steering->fdb_sub_ns); - steering->fdb_sub_ns = NULL; - steering->fdb_root_ns = NULL; + cleanup_fdb_root_ns(steering); return err; } @@ -3079,10 +3090,7 @@ void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev) struct mlx5_flow_steering *steering = dev->priv.steering; cleanup_root_ns(steering->root_ns); - cleanup_root_ns(steering->fdb_root_ns); - steering->fdb_root_ns = NULL; - kfree(steering->fdb_sub_ns); - steering->fdb_sub_ns = NULL; + cleanup_fdb_root_ns(steering); cleanup_root_ns(steering->port_sel_root_ns); cleanup_root_ns(steering->sniffer_rx_root_ns); cleanup_root_ns(steering->sniffer_tx_root_ns); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c index 3d5e57ff558c..7e02cbe8c3b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c @@ -12,13 +12,16 @@ struct mlx5_dm { spinlock_t lock; unsigned long *steering_sw_icm_alloc_blocks; unsigned long *header_modify_sw_icm_alloc_blocks; + unsigned long *header_modify_pattern_sw_icm_alloc_blocks; }; struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev) { + u64 header_modify_pattern_icm_blocks = 0; u64 header_modify_icm_blocks = 0; u64 steering_icm_blocks = 0; struct mlx5_dm *dm; + bool support_v2; if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)) return NULL; @@ -53,8 +56,27 @@ struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev) goto err_modify_hdr; } + support_v2 = MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) && + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2) && + MLX5_CAP64_DEV_MEM(dev, header_modify_pattern_sw_icm_start_address); + + if (support_v2) { + header_modify_pattern_icm_blocks = + BIT(MLX5_CAP_DEV_MEM(dev, log_header_modify_pattern_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + dm->header_modify_pattern_sw_icm_alloc_blocks = + kcalloc(BITS_TO_LONGS(header_modify_pattern_icm_blocks), + sizeof(unsigned long), GFP_KERNEL); + if (!dm->header_modify_pattern_sw_icm_alloc_blocks) + goto err_pattern; + } + return dm; +err_pattern: + kfree(dm->header_modify_sw_icm_alloc_blocks); + err_modify_hdr: kfree(dm->steering_sw_icm_alloc_blocks); @@ -86,6 +108,14 @@ void mlx5_dm_cleanup(struct mlx5_core_dev *dev) kfree(dm->header_modify_sw_icm_alloc_blocks); } + if (dm->header_modify_pattern_sw_icm_alloc_blocks) { + WARN_ON(!bitmap_empty(dm->header_modify_pattern_sw_icm_alloc_blocks, + BIT(MLX5_CAP_DEV_MEM(dev, + log_header_modify_pattern_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)))); + kfree(dm->header_modify_pattern_sw_icm_alloc_blocks); + } + kfree(dm); } @@ -130,6 +160,13 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, log_header_modify_sw_icm_size); block_map = dm->header_modify_sw_icm_alloc_blocks; break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + header_modify_pattern_sw_icm_start_address); + log_icm_size = MLX5_CAP_DEV_MEM(dev, + log_header_modify_pattern_sw_icm_size); + block_map = dm->header_modify_pattern_sw_icm_alloc_blocks; + break; default: return -EINVAL; } @@ -203,6 +240,11 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type icm_start_addr = MLX5_CAP64_DEV_MEM(dev, header_modify_sw_icm_start_address); block_map = dm->header_modify_sw_icm_alloc_blocks; break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + header_modify_pattern_sw_icm_start_address); + block_map = dm->header_modify_pattern_sw_icm_alloc_blocks; + break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c9b4e50a593e..2078d9f03a5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -314,13 +314,6 @@ struct mlx5_reg_host_endianness { u8 rsvd[15]; }; -#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) - -enum { - MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | - MLX5_DEV_CAP_FLAG_DCT, -}; - static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size) { switch (size) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 223c8741b7ae..16d65fe4f654 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -439,6 +439,7 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); MLX5_SET(create_flow_table_in, in, table_type, attr->table_type); + MLX5_SET(create_flow_table_in, in, uid, attr->uid); ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c index e5f6412baea9..31d443dd8386 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c @@ -214,7 +214,7 @@ static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl) tbl->table_type); } -static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) +static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid) { bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); @@ -236,6 +236,7 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) ft_attr.sw_owner = true; ft_attr.decap_en = en_decap; ft_attr.reformat_en = en_encap; + ft_attr.uid = uid; ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr, NULL, &tbl->table_id); @@ -243,7 +244,8 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) return ret; } -struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags) +struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, + u32 flags, u16 uid) { struct mlx5dr_table *tbl; int ret; @@ -263,7 +265,7 @@ struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u if (ret) goto free_tbl; - ret = dr_table_create_sw_owned_tbl(tbl); + ret = dr_table_create_sw_owned_tbl(tbl, uid); if (ret) goto uninit_tbl; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 98320e3945ad..50b0dd4fb4a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -1200,6 +1200,7 @@ struct mlx5dr_cmd_query_flow_table_details { struct mlx5dr_cmd_create_flow_table_attr { u32 table_type; + u16 uid; u64 icm_addr_rx; u64 icm_addr_tx; u8 level; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 6a9abba92df6..c30ed8e18458 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -62,7 +62,7 @@ static int set_miss_action(struct mlx5_flow_root_namespace *ns, static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, - unsigned int size, + struct mlx5_flow_table_attr *ft_attr, struct mlx5_flow_table *next_ft) { struct mlx5dr_table *tbl; @@ -71,7 +71,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, if (mlx5_dr_is_fw_table(ft->flags)) return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, - size, + ft_attr, next_ft); flags = ft->flags; /* turn off encap/decap if not supported for sw-str by fw */ @@ -79,7 +79,8 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); - tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags); + tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags, + ft_attr->uid); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); return -EINVAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index 7626c85643b1..3bb14860b36d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -51,7 +51,8 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn); struct mlx5dr_table * -mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags); +mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags, + u16 uid); struct mlx5dr_table * mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 604b85dd770a..b5f58fd37a0f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -387,21 +387,6 @@ enum { }; enum { - MLX5_DEV_CAP_FLAG_XRC = 1LL << 3, - MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, - MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, - MLX5_DEV_CAP_FLAG_APM = 1LL << 17, - MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, - MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, - MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, - MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, - MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, - MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, - MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, - MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, -}; - -enum { MLX5_ROCE_VERSION_1 = 0, MLX5_ROCE_VERSION_2 = 2, }; @@ -455,6 +440,7 @@ enum { MLX5_OPCODE_UMR = 0x25, + MLX5_OPCODE_ACCESS_ASO = 0x2d, }; enum { @@ -496,10 +482,6 @@ enum { }; enum { - MLX5_CAP_OFF_CMDIF_CSUM = 46, -}; - -enum { /* * Max wqe size for rdma read is 512 bytes, so this * limits our max_sge_rd as the wqe needs to fit: @@ -840,7 +822,10 @@ struct mlx5_cqe64 { __be32 timestamp_l; __be32 sop_drop_qpn; __be16 wqe_counter; - u8 signature; + union { + u8 signature; + u8 validity_iteration_count; + }; u8 op_own; }; @@ -872,6 +857,11 @@ enum { MLX5_CQE_FORMAT_CSUM_STRIDX = 0x3, }; +enum { + MLX5_CQE_COMPRESS_LAYOUT_BASIC = 0, + MLX5_CQE_COMPRESS_LAYOUT_ENHANCED = 1, +}; + #define MLX5_MINI_CQE_ARRAY_SIZE 8 static inline u8 mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) @@ -884,6 +874,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } +static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) +{ + /* num_of_mini_cqes is zero based */ + return get_cqe_opcode(cqe) + 1; +} + static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5040cd774c5a..220597c2f436 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -676,6 +676,7 @@ struct mlx5e_resources { enum mlx5_sw_icm_type { MLX5_SW_ICM_TYPE_STEERING, MLX5_SW_ICM_TYPE_HEADER_MODIFY, + MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN, }; #define MLX5_MAX_RESERVED_GIDS 8 @@ -727,10 +728,10 @@ enum { }; enum { - MR_CACHE_LAST_STD_ENTRY = 20, + MKEY_CACHE_LAST_STD_ENTRY = 20, MLX5_IMR_MTT_CACHE_ENTRY, MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MR_CACHE_ENTRIES + MAX_MKEY_CACHE_ENTRIES }; struct mlx5_profile { @@ -739,7 +740,7 @@ struct mlx5_profile { struct { int size; int limit; - } mr_cache[MAX_MR_CACHE_ENTRIES]; + } mr_cache[MAX_MKEY_CACHE_ENTRIES]; }; struct mlx5_hca_cap { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8135713b0d2d..8e73c377da2c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -178,6 +178,7 @@ struct mlx5_flow_table_attr { int max_fte; u32 level; u32 flags; + u16 uid; struct mlx5_flow_table *next_ft; struct { @@ -212,6 +213,19 @@ struct mlx5_flow_group * mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); +struct mlx5_exe_aso { + u32 object_id; + u8 type; + u8 return_reg_id; + union { + u32 ctrl_data; + struct { + u8 meter_idx; + u8 init_color; + } flow_meter; + }; +}; + struct mlx5_fs_vlan { u16 ethtype; u16 vid; @@ -237,6 +251,7 @@ struct mlx5_flow_act { struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; struct mlx5_flow_group *fg; + struct mlx5_exe_aso exe_aso; }; #define MLX5_DECLARE_FLOW_ACT(name) \ @@ -301,4 +316,5 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev, void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev, struct mlx5_pkt_reformat *reformat); +u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); #endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd7d083a34d3..9321d774e2d8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -442,7 +442,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; - u8 reserved_at_40[0x20]; + u8 reserved_at_40[0x6]; + u8 execute_aso[0x1]; + u8 reserved_at_47[0x19]; u8 reserved_at_60[0x2]; u8 reformat_insert[0x1]; @@ -940,7 +942,17 @@ struct mlx5_ifc_qos_cap_bits { u8 max_tsar_bw_share[0x20]; - u8 reserved_at_100[0x700]; + u8 reserved_at_100[0x20]; + + u8 reserved_at_120[0x3]; + u8 log_meter_aso_granularity[0x5]; + u8 reserved_at_128[0x3]; + u8 log_meter_aso_max_alloc[0x5]; + u8 reserved_at_130[0x3]; + u8 log_max_num_meter_aso[0x5]; + u8 reserved_at_138[0x8]; + + u8 reserved_at_140[0x6c0]; }; struct mlx5_ifc_debug_cap_bits { @@ -1086,11 +1098,14 @@ struct mlx5_ifc_device_mem_cap_bits { u8 log_sw_icm_alloc_granularity[0x6]; u8 log_steering_sw_icm_size[0x8]; - u8 reserved_at_120[0x20]; + u8 reserved_at_120[0x18]; + u8 log_header_modify_pattern_sw_icm_size[0x8]; u8 header_modify_sw_icm_start_address[0x40]; - u8 reserved_at_180[0x80]; + u8 reserved_at_180[0x40]; + + u8 header_modify_pattern_sw_icm_start_address[0x40]; u8 memic_operations[0x20]; @@ -1356,7 +1371,9 @@ enum { }; struct mlx5_ifc_cmd_hca_cap_bits { - u8 reserved_at_0[0x1f]; + u8 reserved_at_0[0x10]; + u8 shared_object_to_user_object_allowed[0x1]; + u8 reserved_at_13[0xe]; u8 vhca_resource_manager[0x1]; u8 hca_cap_2[0x1]; @@ -1426,7 +1443,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_120[0xa]; u8 log_max_ra_req_dc[0x6]; - u8 reserved_at_130[0xa]; + u8 reserved_at_130[0x9]; + u8 vnic_env_cq_overrun[0x1]; u8 log_max_ra_res_dc[0x6]; u8 reserved_at_140[0x5]; @@ -1621,7 +1639,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 nic_receive_steering_discard[0x1]; u8 receive_discard_vport_down[0x1]; u8 transmit_discard_vport_down[0x1]; - u8 reserved_at_343[0x5]; + u8 eq_overrun_count[0x1]; + u8 reserved_at_344[0x1]; + u8 invalid_command_count[0x1]; + u8 quota_exceeded_count[0x1]; + u8 reserved_at_347[0x1]; u8 log_max_flow_counter_bulk[0x8]; u8 max_flow_counter_15_0[0x10]; @@ -1719,7 +1741,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_dci_errored_streams[0x5]; u8 reserved_at_598[0x8]; - u8 reserved_at_5a0[0x13]; + u8 reserved_at_5a0[0x10]; + u8 enhanced_cqe_compression[0x1]; + u8 reserved_at_5b1[0x2]; u8 log_max_dek[0x5]; u8 reserved_at_5b8[0x4]; u8 mini_cqe_resp_stride_index[0x1]; @@ -3277,6 +3301,7 @@ enum { MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, MLX5_FLOW_CONTEXT_ACTION_IPSEC_DECRYPT = 0x1000, MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT = 0x2000, + MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO = 0x4000, }; enum { @@ -3292,6 +3317,38 @@ struct mlx5_ifc_vlan_bits { u8 vid[0xc]; }; +enum { + MLX5_FLOW_METER_COLOR_RED = 0x0, + MLX5_FLOW_METER_COLOR_YELLOW = 0x1, + MLX5_FLOW_METER_COLOR_GREEN = 0x2, + MLX5_FLOW_METER_COLOR_UNDEFINED = 0x3, +}; + +enum { + MLX5_EXE_ASO_FLOW_METER = 0x2, +}; + +struct mlx5_ifc_exe_aso_ctrl_flow_meter_bits { + u8 return_reg_id[0x4]; + u8 aso_type[0x4]; + u8 reserved_at_8[0x14]; + u8 action[0x1]; + u8 init_color[0x2]; + u8 meter_id[0x1]; +}; + +union mlx5_ifc_exe_aso_ctrl { + struct mlx5_ifc_exe_aso_ctrl_flow_meter_bits exe_aso_ctrl_flow_meter; +}; + +struct mlx5_ifc_execute_aso_bits { + u8 valid[0x1]; + u8 reserved_at_1[0x7]; + u8 aso_object_id[0x18]; + + union mlx5_ifc_exe_aso_ctrl exe_aso_ctrl; +}; + struct mlx5_ifc_flow_context_bits { struct mlx5_ifc_vlan_bits push_vlan; @@ -3323,7 +3380,9 @@ struct mlx5_ifc_flow_context_bits { struct mlx5_ifc_fte_match_param_bits match_value; - u8 reserved_at_1200[0x600]; + struct mlx5_ifc_execute_aso_bits execute_aso[4]; + + u8 reserved_at_1300[0x500]; union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[]; }; @@ -3391,11 +3450,21 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 transmit_discard_vport_down[0x40]; - u8 reserved_at_140[0xa0]; + u8 async_eq_overrun[0x20]; + + u8 comp_eq_overrun[0x20]; + + u8 reserved_at_180[0x20]; + + u8 invalid_command[0x20]; + + u8 quota_exceeded_command[0x20]; u8 internal_rq_out_of_buffer[0x20]; - u8 reserved_at_200[0xe00]; + u8 cq_overrun[0x20]; + + u8 reserved_at_220[0xde0]; }; struct mlx5_ifc_traffic_counter_bits { @@ -4074,7 +4143,8 @@ struct mlx5_ifc_cqc_bits { u8 cqe_comp_en[0x1]; u8 mini_cqe_res_format[0x2]; u8 st[0x4]; - u8 reserved_at_18[0x8]; + u8 reserved_at_18[0x6]; + u8 cqe_compression_layout[0x2]; u8 reserved_at_20[0x20]; @@ -5970,7 +6040,9 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits { u8 obj_id[0x20]; - u8 reserved_at_60[0x20]; + u8 reserved_at_60[0x3]; + u8 log_obj_range[0x5]; + u8 reserved_at_68[0x18]; }; struct mlx5_ifc_general_obj_out_cmd_hdr_bits { @@ -8437,7 +8509,7 @@ struct mlx5_ifc_create_flow_table_out_bits { struct mlx5_ifc_create_flow_table_in_bits { u8 opcode[0x10]; - u8 reserved_at_10[0x10]; + u8 uid[0x10]; u8 reserved_at_20[0x10]; u8 op_mod[0x10]; @@ -11370,12 +11442,14 @@ enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24), }; enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, + MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, }; enum { @@ -11448,6 +11522,61 @@ struct mlx5_ifc_create_encryption_key_in_bits { struct mlx5_ifc_encryption_key_obj_bits encryption_key_object; }; +enum { + MLX5_FLOW_METER_MODE_BYTES_IP_LENGTH = 0x0, + MLX5_FLOW_METER_MODE_BYTES_CALC_WITH_L2 = 0x1, + MLX5_FLOW_METER_MODE_BYTES_CALC_WITH_L2_IPG = 0x2, + MLX5_FLOW_METER_MODE_NUM_PACKETS = 0x3, +}; + +struct mlx5_ifc_flow_meter_parameters_bits { + u8 valid[0x1]; + u8 bucket_overflow[0x1]; + u8 start_color[0x2]; + u8 both_buckets_on_green[0x1]; + u8 reserved_at_5[0x1]; + u8 meter_mode[0x2]; + u8 reserved_at_8[0x18]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x3]; + u8 cbs_exponent[0x5]; + u8 cbs_mantissa[0x8]; + u8 reserved_at_50[0x3]; + u8 cir_exponent[0x5]; + u8 cir_mantissa[0x8]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x3]; + u8 ebs_exponent[0x5]; + u8 ebs_mantissa[0x8]; + u8 reserved_at_90[0x3]; + u8 eir_exponent[0x5]; + u8 eir_mantissa[0x8]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_flow_meter_aso_obj_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x8]; + u8 meter_aso_access_pd[0x18]; + + u8 reserved_at_a0[0x160]; + + struct mlx5_ifc_flow_meter_parameters_bits flow_meter_parameters[2]; +}; + +struct mlx5_ifc_create_flow_meter_aso_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_meter_aso_obj_bits flow_meter_aso_obj; +}; + struct mlx5_ifc_sampler_obj_bits { u8 modify_field_select[0x40]; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9c6317cf80d5..7c2f76f34f6f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4603,7 +4603,7 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, /** * ib_lid_cpu16 - Return lid in 16bit CPU encoding. - * In the current implementation the only way to get + * In the current implementation the only way to * get the 32bit lid is from other sources for OPA. * For IB, lids will always be 16bits so cast the * value accordingly. diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index d989f030fae0..5b18e2e36ee6 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -108,6 +108,7 @@ struct rdma_cm_id { enum rdma_ucm_port_space ps; enum ib_qp_type qp_type; u32 port_num; + struct work_struct net_work; }; struct rdma_cm_id * diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index e539c84d63f1..3bee490eb585 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -228,6 +228,7 @@ enum mlx5_ib_objects { MLX5_IB_OBJECT_VAR, MLX5_IB_OBJECT_PP, MLX5_IB_OBJECT_UAR, + MLX5_IB_OBJECT_STEERING_ANCHOR, }; enum mlx5_ib_flow_matcher_create_attrs { @@ -248,6 +249,22 @@ enum mlx5_ib_flow_matcher_methods { MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, }; +enum mlx5_ib_flow_steering_anchor_create_attrs { + MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, + MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, +}; + +enum mlx5_ib_flow_steering_anchor_destroy_attrs { + MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_steering_anchor_methods { + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, +}; + enum mlx5_ib_device_query_context_attrs { MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index a21ca8ece8db..7af9e09ea556 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -63,6 +63,7 @@ enum mlx5_ib_uapi_dm_type { MLX5_IB_UAPI_DM_TYPE_MEMIC, MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM, MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM, + MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM, }; enum mlx5_ib_uapi_devx_create_event_channel_flags { |